mirror of https://github.com/apache/lucene.git
LUCENE-7997: More sanity testing of similarities
This commit is contained in:
parent
81a4f7cc9c
commit
42717d5f4b
|
@ -11,7 +11,12 @@ Changes in Runtime Behavior
|
|||
will now fail to open even if they have been merged with the previous major
|
||||
version. (Adrien Grand)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
SimilarityBase switches to 64-bit doubles internally to help avoid common numeric issues.
|
||||
Add missing range checks for similarity parameters.
|
||||
Improve BM25 and ClassicSimilarity's explanations. (Robert Muir)
|
||||
|
||||
======================= Lucene 7.2.0 =======================
|
||||
|
||||
|
|
|
@ -73,4 +73,22 @@ public class CollectionStatistics {
|
|||
public final long sumDocFreq() {
|
||||
return sumDocFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("field=");
|
||||
sb.append('"');
|
||||
sb.append(field());
|
||||
sb.append('"');
|
||||
sb.append(",maxDoc=");
|
||||
sb.append(maxDoc());
|
||||
sb.append(",docCount=");
|
||||
sb.append(docCount());
|
||||
sb.append(",sumTotalTermFreq=");
|
||||
sb.append(sumTotalTermFreq());
|
||||
sb.append(",sumDocFreq=");
|
||||
sb.append(sumDocFreq);
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -142,7 +142,7 @@ public class TermQuery extends Query {
|
|||
if (newDoc == doc) {
|
||||
float freq = scorer.freq();
|
||||
SimScorer docScorer = similarity.simScorer(stats, context);
|
||||
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
|
||||
Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document");
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(
|
||||
scoreExplanation.getValue(),
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermsEnum; // javadocs
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
/**
|
||||
|
@ -52,4 +53,18 @@ public class TermStatistics {
|
|||
public final long totalTermFreq() {
|
||||
return totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("term=");
|
||||
sb.append('"');
|
||||
sb.append(Term.toString(term()));
|
||||
sb.append('"');
|
||||
sb.append(",docFreq=");
|
||||
sb.append(docFreq());
|
||||
sb.append(",totalTermFreq=");
|
||||
sb.append(totalTermFreq());
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,10 +38,10 @@ public abstract class AfterEffect {
|
|||
public AfterEffect() {}
|
||||
|
||||
/** Returns the aftereffect score. */
|
||||
public abstract float score(BasicStats stats, float tfn);
|
||||
public abstract double score(BasicStats stats, double tfn);
|
||||
|
||||
/** Returns an explanation for the score. */
|
||||
public abstract Explanation explain(BasicStats stats, float tfn);
|
||||
public abstract Explanation explain(BasicStats stats, double tfn);
|
||||
|
||||
/** Implementation used when there is no aftereffect. */
|
||||
public static final class NoAfterEffect extends AfterEffect {
|
||||
|
@ -50,12 +50,12 @@ public abstract class AfterEffect {
|
|||
public NoAfterEffect() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
return 1f;
|
||||
public double score(BasicStats stats, double tfn) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
public Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(1, "no aftereffect");
|
||||
}
|
||||
|
||||
|
|
|
@ -29,18 +29,18 @@ public class AfterEffectB extends AfterEffect {
|
|||
public AfterEffectB() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
long F = stats.getTotalTermFreq()+1;
|
||||
long n = stats.getDocFreq()+1;
|
||||
return (F + 1) / (n * (tfn + 1));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
score(stats, tfn),
|
||||
(float) score(stats, tfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(tfn, "tfn"),
|
||||
Explanation.match((float) tfn, "tfn"),
|
||||
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
|
||||
Explanation.match(stats.getDocFreq(), "docFreq"));
|
||||
}
|
||||
|
|
|
@ -29,16 +29,16 @@ public class AfterEffectL extends AfterEffect {
|
|||
public AfterEffectL() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
return 1 / (tfn + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
score(stats, tfn),
|
||||
(float) score(stats, tfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(tfn, "tfn"));
|
||||
Explanation.match((float) tfn, "tfn"));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -100,7 +100,7 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float score(BasicStats stats, float freq, float docLen) {
|
||||
public double score(BasicStats stats, double freq, double docLen) {
|
||||
return tf(stats, freq, docLen)
|
||||
* ln(stats, freq, docLen)
|
||||
* tfln(stats, freq, docLen)
|
||||
|
@ -110,19 +110,19 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
|
||||
}
|
||||
|
||||
subs.add(Explanation.match(this.k, "k"));
|
||||
subs.add(Explanation.match(this.s, "s"));
|
||||
subs.add(Explanation.match(this.queryLen, "queryLen"));
|
||||
subs.add(Explanation.match(tf(stats, freq, docLen), "tf"));
|
||||
subs.add(Explanation.match(ln(stats, freq, docLen), "ln"));
|
||||
subs.add(Explanation.match(tfln(stats, freq, docLen), "tfln"));
|
||||
subs.add(Explanation.match(idf(stats, freq, docLen), "idf"));
|
||||
subs.add(Explanation.match(gamma(stats, freq, docLen), "gamma"));
|
||||
subs.add(Explanation.match((float) tf(stats, freq, docLen), "tf"));
|
||||
subs.add(Explanation.match((float) ln(stats, freq, docLen), "ln"));
|
||||
subs.add(Explanation.match((float) tfln(stats, freq, docLen), "tfln"));
|
||||
subs.add(Explanation.match((float) idf(stats, freq, docLen), "idf"));
|
||||
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
|
||||
super.explain(subs, stats, doc, freq, docLen);
|
||||
}
|
||||
|
||||
|
@ -135,25 +135,25 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
/**
|
||||
* compute the term frequency component
|
||||
*/
|
||||
protected abstract float tf(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double tf(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
protected abstract float ln(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double ln(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
protected abstract float tfln(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double tfln(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* compute the inverted document frequency component
|
||||
*/
|
||||
protected abstract float idf(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double idf(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* compute the gamma component (only for F3EXp and F3LOG)
|
||||
*/
|
||||
protected abstract float gamma(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double gamma(BasicStats stats, double freq, double docLen);
|
||||
}
|
|
@ -56,16 +56,16 @@ public class AxiomaticF1EXP extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
if (freq <= 0.0) return 0f;
|
||||
return (float) (1 + Math.log(1 + Math.log(freq)));
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
if (freq <= 0.0) return 0.0;
|
||||
return 1 + Math.log(1 + Math.log(freq));
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s);
|
||||
}
|
||||
|
||||
|
@ -73,23 +73,23 @@ public class AxiomaticF1EXP extends Axiomatic {
|
|||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
return 0f;
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -49,16 +49,16 @@ public class AxiomaticF1LOG extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
if (freq <= 0.0) return 0f;
|
||||
return (float) (1 + Math.log(1 + Math.log(freq)));
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
if (freq <= 0.0) return 0.0;
|
||||
return 1 + Math.log(1 + Math.log(freq));
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s);
|
||||
}
|
||||
|
||||
|
@ -66,23 +66,23 @@ public class AxiomaticF1LOG extends Axiomatic {
|
|||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
return 0f;
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -56,23 +56,23 @@ public class AxiomaticF2EXP extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength());
|
||||
}
|
||||
|
||||
|
@ -80,15 +80,15 @@ public class AxiomaticF2EXP extends Axiomatic {
|
|||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
return 0f;
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -48,23 +48,23 @@ public class AxiomaticF2LOG extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength());
|
||||
}
|
||||
|
||||
|
@ -72,15 +72,15 @@ public class AxiomaticF2LOG extends Axiomatic {
|
|||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
return 0f;
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -17,10 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
|
||||
* F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
|
||||
* where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq
|
||||
* gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl
|
||||
*
|
||||
* NOTE: the gamma function of this similarity creates negative scores
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AxiomaticF3EXP extends Axiomatic {
|
||||
|
@ -55,40 +55,40 @@ public class AxiomaticF3EXP extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
if (freq <= 0.0) return 0f;
|
||||
return (float) (1 + Math.log(1 + Math.log(freq)));
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
if (freq <= 0.0) return 0.0;
|
||||
return 1 + Math.log(1 + Math.log(freq));
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
|
||||
}
|
||||
}
|
|
@ -17,10 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
|
||||
* F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
|
||||
* where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq
|
||||
* gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl
|
||||
*
|
||||
* NOTE: the gamma function of this similarity creates negative scores
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AxiomaticF3LOG extends Axiomatic {
|
||||
|
@ -44,40 +44,40 @@ public class AxiomaticF3LOG extends Axiomatic {
|
|||
* compute the term frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float tf(BasicStats stats, float freq, float docLen) {
|
||||
if (freq <= 0.0) return 0f;
|
||||
return (float) (1 + Math.log(1 + Math.log(freq)));
|
||||
protected double tf(BasicStats stats, double freq, double docLen) {
|
||||
if (freq <= 0.0) return 0.0;
|
||||
return 1 + Math.log(1 + Math.log(freq));
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the document length component
|
||||
*/
|
||||
@Override
|
||||
protected float ln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double ln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the mixed term frequency and document length component
|
||||
*/
|
||||
@Override
|
||||
protected float tfln(BasicStats stats, float freq, float docLen) {
|
||||
return 1f;
|
||||
protected double tfln(BasicStats stats, double freq, double docLen) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the inverted document frequency component
|
||||
*/
|
||||
@Override
|
||||
protected float idf(BasicStats stats, float freq, float docLen) {
|
||||
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
protected double idf(BasicStats stats, double freq, double docLen) {
|
||||
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the gamma component
|
||||
*/
|
||||
@Override
|
||||
protected float gamma(BasicStats stats, float freq, float docLen) {
|
||||
protected double gamma(BasicStats stats, double freq, double docLen) {
|
||||
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
|
||||
}
|
||||
}
|
|
@ -159,9 +159,9 @@ public class BM25Similarity extends Similarity {
|
|||
final long df = termStats.docFreq();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
|
||||
Explanation.match(df, "docFreq"),
|
||||
Explanation.match(docCount, "docCount"));
|
||||
return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
|
||||
Explanation.match(df, "n, number of documents containing term"),
|
||||
Explanation.match(docCount, "N, total number of documents with field"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -185,7 +185,7 @@ public class BM25Similarity extends Similarity {
|
|||
details.add(idfExplain);
|
||||
idf += idfExplain.getValue();
|
||||
}
|
||||
return Explanation.match((float) idf, "idf(), sum of:", details);
|
||||
return Explanation.match((float) idf, "idf, sum of:", details);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -197,7 +197,7 @@ public class BM25Similarity extends Similarity {
|
|||
for (int i = 0; i < cache.length; i++) {
|
||||
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
|
||||
}
|
||||
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
|
||||
return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -217,7 +217,7 @@ public class BM25Similarity extends Similarity {
|
|||
|
||||
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1);
|
||||
this.weightValue = stats.weight;
|
||||
this.norms = norms;
|
||||
lengthCache = LENGTH_TABLE;
|
||||
cache = stats.cache;
|
||||
|
@ -226,7 +226,7 @@ public class BM25Similarity extends Similarity {
|
|||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
// if there are no norms, we act as if b=0
|
||||
float norm;
|
||||
double norm;
|
||||
if (norms == null) {
|
||||
norm = k1;
|
||||
} else {
|
||||
|
@ -236,12 +236,48 @@ public class BM25Similarity extends Similarity {
|
|||
norm = cache[0];
|
||||
}
|
||||
}
|
||||
return weightValue * freq / (freq + norm);
|
||||
return weightValue * (float) (freq / (freq + norm));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return explainScore(doc, freq, stats, norms, lengthCache);
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.addAll(stats.explain());
|
||||
Explanation tfExpl = explainTF(doc, freq);
|
||||
subs.add(tfExpl);
|
||||
return Explanation.match(stats.weight * tfExpl.getValue(),
|
||||
"score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs);
|
||||
}
|
||||
|
||||
private Explanation explainTF(int doc, Explanation freq) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.add(freq);
|
||||
subs.add(Explanation.match(k1, "k1, term saturation parameter"));
|
||||
if (norms == null) {
|
||||
subs.add(Explanation.match(0, "b, field omits length norms"));
|
||||
return Explanation.match(
|
||||
(float) (freq.getValue() / (freq.getValue() + (double) k1)),
|
||||
"tf, computed as freq / (freq + k1) from:", subs);
|
||||
} else {
|
||||
byte norm;
|
||||
if (norms.advanceExact(doc)) {
|
||||
norm = (byte) norms.longValue();
|
||||
} else {
|
||||
norm = 0;
|
||||
}
|
||||
float doclen = lengthCache[norm & 0xff];
|
||||
subs.add(Explanation.match(b, "b, length normalization parameter"));
|
||||
if ((norm & 0xFF) > 39) {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
|
||||
} else {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field"));
|
||||
}
|
||||
subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field"));
|
||||
float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl);
|
||||
return Explanation.match(
|
||||
(float) (freq.getValue() / (freq.getValue() + (double) normValue)),
|
||||
"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -257,69 +293,45 @@ public class BM25Similarity extends Similarity {
|
|||
|
||||
/** Collection statistics for the BM25 model. */
|
||||
private static class BM25Stats extends SimWeight {
|
||||
/** field name, for pulling norms */
|
||||
private final String field;
|
||||
/** query boost */
|
||||
private final float boost;
|
||||
/** k1 value for scale factor */
|
||||
private final float k1;
|
||||
/** BM25's idf */
|
||||
private final Explanation idf;
|
||||
/** The average document length. */
|
||||
private final float avgdl;
|
||||
/** query boost */
|
||||
private final float boost;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float[] cache;
|
||||
/** weight (idf * boost) */
|
||||
private final float weight;
|
||||
/** field name, for pulling norms */
|
||||
private final String field;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
|
||||
* for LENGTH_TABLE */
|
||||
private final float[] cache;
|
||||
|
||||
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) {
|
||||
BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) {
|
||||
this.field = field;
|
||||
this.boost = boost;
|
||||
this.idf = idf;
|
||||
this.avgdl = avgdl;
|
||||
this.weight = idf.getValue() * boost;
|
||||
this.k1 = k1;
|
||||
this.cache = cache;
|
||||
this.weight = (k1 + 1) * boost * idf.getValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.add(freq);
|
||||
subs.add(Explanation.match(k1, "parameter k1"));
|
||||
if (norms == null) {
|
||||
subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
|
||||
return Explanation.match(
|
||||
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
|
||||
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
|
||||
} else {
|
||||
byte norm;
|
||||
if (norms.advanceExact(doc)) {
|
||||
norm = (byte) norms.longValue();
|
||||
} else {
|
||||
norm = 0;
|
||||
private List<Explanation> explain() {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
// scale factor
|
||||
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
|
||||
// query boost
|
||||
if (boost != 1.0f) {
|
||||
subs.add(Explanation.match(boost, "boost"));
|
||||
}
|
||||
float doclen = lengthCache[norm & 0xff];
|
||||
subs.add(Explanation.match(b, "parameter b"));
|
||||
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
|
||||
subs.add(Explanation.match(doclen, "fieldLength"));
|
||||
return Explanation.match(
|
||||
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
|
||||
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
|
||||
// idf
|
||||
subs.add(idf);
|
||||
return subs;
|
||||
}
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
|
||||
Explanation boostExpl = Explanation.match(stats.boost, "boost");
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
if (boostExpl.getValue() != 1.0f)
|
||||
subs.add(boostExpl);
|
||||
subs.add(stats.idf);
|
||||
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
|
||||
subs.add(tfNormExpl);
|
||||
return Explanation.match(
|
||||
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
|
||||
"score(doc="+doc+",freq="+freq+"), product of:", subs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@ -37,7 +37,7 @@ public abstract class BasicModel {
|
|||
public BasicModel() {}
|
||||
|
||||
/** Returns the informative content score. */
|
||||
public abstract float score(BasicStats stats, float tfn);
|
||||
public abstract double score(BasicStats stats, double tfn);
|
||||
|
||||
/**
|
||||
* Returns an explanation for the score.
|
||||
|
@ -46,9 +46,9 @@ public abstract class BasicModel {
|
|||
* explanation for such models. Subclasses that use other statistics must
|
||||
* override this method.</p>
|
||||
*/
|
||||
public Explanation explain(BasicStats stats, float tfn) {
|
||||
public Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
score(stats, tfn),
|
||||
(float) score(stats, tfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
|
||||
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));
|
||||
|
|
|
@ -35,11 +35,11 @@ public class BasicModelBE extends BasicModel {
|
|||
public BasicModelBE() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
double F = stats.getTotalTermFreq() + 1 + tfn;
|
||||
// approximation only holds true when F << N, so we use N += F
|
||||
double N = F + stats.getNumberOfDocuments();
|
||||
return (float)(-log2((N - 1) * Math.E)
|
||||
return (-log2((N - 1) * Math.E)
|
||||
+ f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
|
||||
}
|
||||
|
||||
|
|
|
@ -37,16 +37,16 @@ public class BasicModelD extends BasicModel {
|
|||
public BasicModelD() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
// we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
|
||||
// resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
|
||||
// to create a 'normalized' F.
|
||||
double F = stats.getTotalTermFreq() + 1 + tfn;
|
||||
double phi = (double)tfn / F;
|
||||
double phi = tfn / F;
|
||||
double nphi = 1 - phi;
|
||||
double p = 1.0 / (stats.getNumberOfDocuments() + 1);
|
||||
double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
|
||||
return (float)(D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi));
|
||||
return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -31,13 +31,13 @@ public class BasicModelG extends BasicModel {
|
|||
public BasicModelG() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
|
||||
double F = stats.getTotalTermFreq() + 1;
|
||||
double N = stats.getNumberOfDocuments();
|
||||
double lambda = F / (N + F);
|
||||
// -log(1 / (lambda + 1)) -> log(lambda + 1)
|
||||
return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
|
||||
return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -29,10 +29,10 @@ public class BasicModelIF extends BasicModel {
|
|||
public BasicModelIF() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
return tfn * (float)(log2(1 + (N + 1) / (F + 0.5)));
|
||||
return tfn * log2(1 + (N + 1) / (F + 0.5));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -30,16 +30,16 @@ public class BasicModelIn extends BasicModel {
|
|||
public BasicModelIn() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long n = stats.getDocFreq();
|
||||
return tfn * (float)(log2((N + 1) / (n + 0.5)));
|
||||
return tfn * log2((N + 1) / (n + 0.5));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
score(stats, tfn),
|
||||
(float) score(stats, tfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
|
||||
Explanation.match(stats.getDocFreq(), "docFreq"));
|
||||
|
|
|
@ -30,11 +30,11 @@ public class BasicModelIne extends BasicModel {
|
|||
public BasicModelIne() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
|
||||
return tfn * (float)(log2((N + 1) / (ne + 0.5)));
|
||||
return tfn * log2((N + 1) / (ne + 0.5));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -35,11 +35,11 @@ public class BasicModelP extends BasicModel {
|
|||
public BasicModelP() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1);
|
||||
return (float)(tfn * log2(tfn / lambda)
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1);
|
||||
return tfn * log2(tfn / lambda)
|
||||
+ (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
|
||||
+ 0.5 * log2(2 * Math.PI * tfn));
|
||||
+ 0.5 * log2(2 * Math.PI * tfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -30,7 +30,7 @@ public class BasicStats extends Similarity.SimWeight {
|
|||
/** The total number of tokens in the field. */
|
||||
protected long numberOfFieldTokens;
|
||||
/** The average field length. */
|
||||
protected float avgFieldLength;
|
||||
protected double avgFieldLength;
|
||||
/** The document frequency. */
|
||||
protected long docFreq;
|
||||
/** The total number of occurrences of this term across all documents. */
|
||||
|
@ -39,10 +39,10 @@ public class BasicStats extends Similarity.SimWeight {
|
|||
// -------------------------- Boost-related stuff --------------------------
|
||||
|
||||
/** A query boost. Should be applied as a multiplicative factor to the score. */
|
||||
protected final float boost;
|
||||
protected final double boost;
|
||||
|
||||
/** Constructor. */
|
||||
public BasicStats(String field, float boost) {
|
||||
public BasicStats(String field, double boost) {
|
||||
this.field = field;
|
||||
this.boost = boost;
|
||||
}
|
||||
|
@ -76,12 +76,12 @@ public class BasicStats extends Similarity.SimWeight {
|
|||
}
|
||||
|
||||
/** Returns the average field length. */
|
||||
public float getAvgFieldLength() {
|
||||
public double getAvgFieldLength() {
|
||||
return avgFieldLength;
|
||||
}
|
||||
|
||||
/** Sets the average field length. */
|
||||
public void setAvgFieldLength(float avgFieldLength) {
|
||||
public void setAvgFieldLength(double avgFieldLength) {
|
||||
this.avgFieldLength = avgFieldLength;
|
||||
}
|
||||
|
||||
|
@ -106,7 +106,7 @@ public class BasicStats extends Similarity.SimWeight {
|
|||
}
|
||||
|
||||
/** Returns the total boost. */
|
||||
public float getBoost() {
|
||||
public double getBoost() {
|
||||
return boost;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -73,7 +73,7 @@ public class BooleanSimilarity extends Similarity {
|
|||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
Explanation queryBoostExpl = Explanation.match(boost, "query boost");
|
||||
Explanation queryBoostExpl = Explanation.match(boost, "boost");
|
||||
return Explanation.match(
|
||||
queryBoostExpl.getValue(),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
|
||||
|
|
|
@ -65,8 +65,8 @@ public class ClassicSimilarity extends TFIDFSimilarity {
|
|||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
|
||||
Explanation.match(df, "docFreq"),
|
||||
Explanation.match(docCount, "docCount"));
|
||||
Explanation.match(df, "docFreq, number of documents containing term"),
|
||||
Explanation.match(docCount, "docCount, total number of documents with field"));
|
||||
}
|
||||
|
||||
/** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
|
||||
|
|
|
@ -50,16 +50,16 @@ public class DFISimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
|
||||
final float expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);
|
||||
final double expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);
|
||||
|
||||
// if the observed frequency is less than or equal to the expected value, then return zero.
|
||||
if (freq <= expected) return 0;
|
||||
|
||||
final float measure = independence.score(freq, expected);
|
||||
final double measure = independence.score(freq, expected);
|
||||
|
||||
return stats.getBoost() * (float) log2(measure + 1);
|
||||
return stats.getBoost() * log2(measure + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -107,17 +107,17 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
float tfn = normalization.tfn(stats, freq, docLen);
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
double tfn = normalization.tfn(stats, freq, docLen);
|
||||
return stats.getBoost() *
|
||||
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs,
|
||||
BasicStats stats, int doc, float freq, float docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
BasicStats stats, int doc, double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match( (float)stats.getBoost(), "boost"));
|
||||
}
|
||||
|
||||
Explanation normExpl = normalization.explain(stats, freq, docLen);
|
||||
|
|
|
@ -34,13 +34,13 @@ public abstract class Distribution {
|
|||
public Distribution() {}
|
||||
|
||||
/** Computes the score. */
|
||||
public abstract float score(BasicStats stats, float tfn, float lambda);
|
||||
public abstract double score(BasicStats stats, double tfn, double lambda);
|
||||
|
||||
/** Explains the score. Returns the name of the model only, since
|
||||
* both {@code tfn} and {@code lambda} are explained elsewhere. */
|
||||
public Explanation explain(BasicStats stats, float tfn, float lambda) {
|
||||
return Explanation.match(
|
||||
score(stats, tfn, lambda), getClass().getSimpleName());
|
||||
public Explanation explain(BasicStats stats, double tfn, double lambda) {
|
||||
return Explanation.match((float)score(stats, tfn, lambda),
|
||||
getClass().getSimpleName());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -30,8 +30,8 @@ public class DistributionLL extends Distribution {
|
|||
public DistributionLL() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn, float lambda) {
|
||||
return (float)-Math.log(lambda / (tfn + lambda));
|
||||
public final double score(BasicStats stats, double tfn, double lambda) {
|
||||
return -Math.log(lambda / (tfn + lambda));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -33,11 +33,11 @@ public class DistributionSPL extends Distribution {
|
|||
public DistributionSPL() {}
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn, float lambda) {
|
||||
if (lambda == 1f) {
|
||||
lambda = 0.99f;
|
||||
public final double score(BasicStats stats, double tfn, double lambda) {
|
||||
if (lambda == 1d) {
|
||||
lambda = 0.99d;
|
||||
}
|
||||
return (float)-Math.log(
|
||||
return -Math.log(
|
||||
(Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda));
|
||||
}
|
||||
|
||||
|
|
|
@ -95,7 +95,7 @@ public class IBSimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
return stats.getBoost() *
|
||||
distribution.score(
|
||||
stats,
|
||||
|
@ -105,9 +105,9 @@ public class IBSimilarity extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected void explain(
|
||||
List<Explanation> subs, BasicStats stats, int doc, float freq, float docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float)stats.getBoost(), "boost"));
|
||||
}
|
||||
Explanation normExpl = normalization.explain(stats, freq, docLen);
|
||||
Explanation lambdaExpl = lambda.explain(stats);
|
||||
|
|
|
@ -38,7 +38,7 @@ public abstract class Independence {
|
|||
* @param freq actual term frequency
|
||||
* @param expected expected term frequency
|
||||
*/
|
||||
public abstract float score(float freq, float expected);
|
||||
public abstract double score(double freq, double expected);
|
||||
|
||||
// subclasses must provide a name
|
||||
@Override
|
||||
|
|
|
@ -33,7 +33,7 @@ public class IndependenceChiSquared extends Independence {
|
|||
public IndependenceChiSquared() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
public double score(double freq, double expected) {
|
||||
return (freq - expected) * (freq - expected) / expected;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ public class IndependenceSaturated extends Independence {
|
|||
public IndependenceSaturated() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
public double score(double freq, double expected) {
|
||||
return (freq - expected) / expected;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,8 +34,8 @@ public class IndependenceStandardized extends Independence {
|
|||
public IndependenceStandardized() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
return (freq - expected) / (float) Math.sqrt(expected);
|
||||
public double score(double freq, double expected) {
|
||||
return (freq - expected) / Math.sqrt(expected);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -44,11 +44,17 @@ public class LMDirichletSimilarity extends LMSimilarity {
|
|||
/** Instantiates the similarity with the provided μ parameter. */
|
||||
public LMDirichletSimilarity(CollectionModel collectionModel, float mu) {
|
||||
super(collectionModel);
|
||||
if (Float.isFinite(mu) == false || mu < 0) {
|
||||
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
|
||||
}
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the provided μ parameter. */
|
||||
public LMDirichletSimilarity(float mu) {
|
||||
if (Float.isFinite(mu) == false || mu < 0) {
|
||||
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
|
||||
}
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
|
@ -63,18 +69,18 @@ public class LMDirichletSimilarity extends LMSimilarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
float score = stats.getBoost() * (float)(Math.log(1 + freq /
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
double score = stats.getBoost() * (Math.log(1 + freq /
|
||||
(mu * ((LMStats)stats).getCollectionProbability())) +
|
||||
Math.log(mu / (docLen + mu)));
|
||||
return score > 0.0f ? score : 0.0f;
|
||||
return score > 0.0d ? score : 0.0d;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
|
||||
}
|
||||
|
||||
subs.add(Explanation.match(mu, "mu"));
|
||||
|
|
|
@ -31,7 +31,9 @@ import org.apache.lucene.search.Explanation;
|
|||
* <p>The model has a single parameter, λ. According to said paper, the
|
||||
* optimal value depends on both the collection and the query. The optimal value
|
||||
* is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
|
||||
*
|
||||
* <p>Values should be between 0 (exclusive) and 1 (inclusive). Values near zero act score more
|
||||
* like a conjunction (coordinate level matching), whereas values near 1 behave
|
||||
* the opposite (more like pure disjunction).
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LMJelinekMercerSimilarity extends LMSimilarity {
|
||||
|
@ -42,27 +44,33 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
|
|||
public LMJelinekMercerSimilarity(
|
||||
CollectionModel collectionModel, float lambda) {
|
||||
super(collectionModel);
|
||||
if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) {
|
||||
throw new IllegalArgumentException("lambda must be in the range (0 .. 1]");
|
||||
}
|
||||
this.lambda = lambda;
|
||||
}
|
||||
|
||||
/** Instantiates with the specified λ parameter. */
|
||||
public LMJelinekMercerSimilarity(float lambda) {
|
||||
if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) {
|
||||
throw new IllegalArgumentException("lambda must be in the range (0 .. 1]");
|
||||
}
|
||||
this.lambda = lambda;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
return stats.getBoost() *
|
||||
(float)Math.log(1 +
|
||||
Math.log(1 +
|
||||
((1 - lambda) * freq / docLen) /
|
||||
(lambda * ((LMStats)stats).getCollectionProbability()));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
|
||||
}
|
||||
subs.add(Explanation.match(lambda, "lambda"));
|
||||
super.explain(subs, stats, doc, freq, docLen);
|
||||
|
|
|
@ -54,7 +54,7 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected BasicStats newStats(String field, float boost) {
|
||||
protected BasicStats newStats(String field, double boost) {
|
||||
return new LMStats(field, boost);
|
||||
}
|
||||
|
||||
|
@ -71,8 +71,8 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subExpls, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
subExpls.add(Explanation.match(collectionModel.computeProbability(stats),
|
||||
double freq, double docLen) {
|
||||
subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats),
|
||||
"collection probability"));
|
||||
}
|
||||
|
||||
|
@ -103,12 +103,12 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
/** Stores the collection distribution of the current term. */
|
||||
public static class LMStats extends BasicStats {
|
||||
/** The probability that the current term is generated by the collection. */
|
||||
private float collectionProbability;
|
||||
private double collectionProbability;
|
||||
|
||||
/**
|
||||
* Creates LMStats for the provided field and query-time boost
|
||||
*/
|
||||
public LMStats(String field, float boost) {
|
||||
public LMStats(String field, double boost) {
|
||||
super(field, boost);
|
||||
}
|
||||
|
||||
|
@ -116,7 +116,7 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
* Returns the probability that the current term is generated by the
|
||||
* collection.
|
||||
*/
|
||||
public final float getCollectionProbability() {
|
||||
public final double getCollectionProbability() {
|
||||
return collectionProbability;
|
||||
}
|
||||
|
||||
|
@ -124,7 +124,7 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
* Sets the probability that the current term is generated by the
|
||||
* collection.
|
||||
*/
|
||||
public final void setCollectionProbability(float collectionProbability) {
|
||||
public final void setCollectionProbability(double collectionProbability) {
|
||||
this.collectionProbability = collectionProbability;
|
||||
}
|
||||
}
|
||||
|
@ -135,7 +135,7 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
* Computes the probability {@code p(w|C)} according to the language model
|
||||
* strategy for the current term.
|
||||
*/
|
||||
public float computeProbability(BasicStats stats);
|
||||
public double computeProbability(BasicStats stats);
|
||||
|
||||
/** The name of the collection model strategy. */
|
||||
public String getName();
|
||||
|
@ -151,8 +151,8 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
public DefaultCollectionModel() {}
|
||||
|
||||
@Override
|
||||
public float computeProbability(BasicStats stats) {
|
||||
return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F);
|
||||
public double computeProbability(BasicStats stats) {
|
||||
return (stats.getTotalTermFreq()+1D) / (stats.getNumberOfFieldTokens()+1D);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -36,7 +36,7 @@ public abstract class Normalization {
|
|||
|
||||
/** Returns the normalized term frequency.
|
||||
* @param len the field length. */
|
||||
public abstract float tfn(BasicStats stats, float tf, float len);
|
||||
public abstract double tfn(BasicStats stats, double tf, double len);
|
||||
|
||||
/** Returns an explanation for the normalized term frequency.
|
||||
* <p>The default normalization methods use the field length of the document
|
||||
|
@ -44,13 +44,13 @@ public abstract class Normalization {
|
|||
* This method provides a generic explanation for such methods.
|
||||
* Subclasses that use other statistics must override this method.</p>
|
||||
*/
|
||||
public Explanation explain(BasicStats stats, float tf, float len) {
|
||||
public Explanation explain(BasicStats stats, double tf, double len) {
|
||||
return Explanation.match(
|
||||
tfn(stats, tf, len),
|
||||
(float) tfn(stats, tf, len),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(tf, "tf"),
|
||||
Explanation.match(stats.getAvgFieldLength(), "avgFieldLength"),
|
||||
Explanation.match(len, "len"));
|
||||
Explanation.match((float) tf, "tf"),
|
||||
Explanation.match((float) stats.getAvgFieldLength(), "avgFieldLength"),
|
||||
Explanation.match((float) len, "len"));
|
||||
}
|
||||
|
||||
/** Implementation used when there is no normalization. */
|
||||
|
@ -60,12 +60,12 @@ public abstract class Normalization {
|
|||
public NoNormalization() {}
|
||||
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
public double tfn(BasicStats stats, double tf, double len) {
|
||||
return tf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tf, float len) {
|
||||
public Explanation explain(BasicStats stats, double tf, double len) {
|
||||
return Explanation.match(1, "no normalization");
|
||||
}
|
||||
|
||||
|
|
|
@ -36,6 +36,10 @@ public class NormalizationH1 extends Normalization {
|
|||
* normalization with respect to the document length.
|
||||
*/
|
||||
public NormalizationH1(float c) {
|
||||
// unbounded but typical range 0..10 or so
|
||||
if (Float.isFinite(c) == false || c < 0) {
|
||||
throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value");
|
||||
}
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
|
@ -47,8 +51,8 @@ public class NormalizationH1 extends Normalization {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return tf * c * stats.getAvgFieldLength() / len;
|
||||
public final double tfn(BasicStats stats, double tf, double len) {
|
||||
return tf * c * (stats.getAvgFieldLength() / len);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -38,6 +38,10 @@ public class NormalizationH2 extends Normalization {
|
|||
* normalization with respect to the document length.
|
||||
*/
|
||||
public NormalizationH2(float c) {
|
||||
// unbounded but typical range 0..10 or so
|
||||
if (Float.isFinite(c) == false || c < 0) {
|
||||
throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value");
|
||||
}
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
|
@ -49,8 +53,8 @@ public class NormalizationH2 extends Normalization {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
|
||||
public final double tfn(BasicStats stats, double tf, double len) {
|
||||
return tf * log2(1 + c * stats.getAvgFieldLength() / len);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -36,11 +36,14 @@ public class NormalizationH3 extends Normalization {
|
|||
* @param mu smoothing parameter <code>μ</code>
|
||||
*/
|
||||
public NormalizationH3(float mu) {
|
||||
if (Float.isFinite(mu) == false || mu < 0) {
|
||||
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
|
||||
}
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float tfn(BasicStats stats, float tf, float len) {
|
||||
public double tfn(BasicStats stats, double tf, double len) {
|
||||
return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,15 +34,18 @@ public class NormalizationZ extends Normalization {
|
|||
/**
|
||||
* Creates NormalizationZ with the supplied parameter <code>z</code>.
|
||||
* @param z represents <code>A/(A+1)</code> where <code>A</code>
|
||||
* measures the specificity of the language.
|
||||
* measures the specificity of the language. It ranges from (0 .. 0.5)
|
||||
*/
|
||||
public NormalizationZ(float z) {
|
||||
if (Float.isNaN(z) || z <= 0f || z >= 0.5f) {
|
||||
throw new IllegalArgumentException("illegal z value: " + z + ", must be in the range (0 .. 0.5)");
|
||||
}
|
||||
this.z = z;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float tfn(BasicStats stats, float tf, float len) {
|
||||
return (float)(tf * Math.pow(stats.avgFieldLength / len, z));
|
||||
public double tfn(BasicStats stats, double tf, double len) {
|
||||
return tf * Math.pow(stats.avgFieldLength / len, z);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -34,7 +34,7 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* A subclass of {@code Similarity} that provides a simplified API for its
|
||||
* descendants. Subclasses are only required to implement the {@link #score}
|
||||
* and {@link #toString()} methods. Implementing
|
||||
* {@link #explain(List, BasicStats, int, float, float)} is optional,
|
||||
* {@link #explain(List, BasicStats, int, double, double)} is optional,
|
||||
* inasmuch as SimilarityBase already provides a basic explanation of the score
|
||||
* and the term frequency. However, implementers of a subclass are encouraged to
|
||||
* include as much detail about the scoring method as possible.
|
||||
|
@ -93,7 +93,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
}
|
||||
|
||||
/** Factory method to return a custom stats object */
|
||||
protected BasicStats newStats(String field, float boost) {
|
||||
protected BasicStats newStats(String field, double boost) {
|
||||
return new BasicStats(field, boost);
|
||||
}
|
||||
|
||||
|
@ -113,7 +113,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
}
|
||||
|
||||
final long numberOfFieldTokens;
|
||||
final float avgFieldLength;
|
||||
final double avgFieldLength;
|
||||
|
||||
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
||||
|
||||
|
@ -145,7 +145,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
* @param docLen the document length.
|
||||
* @return the score.
|
||||
*/
|
||||
protected abstract float score(BasicStats stats, float freq, float docLen);
|
||||
protected abstract double score(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* Subclasses should implement this method to explain the score. {@code expl}
|
||||
|
@ -161,16 +161,16 @@ public abstract class SimilarityBase extends Similarity {
|
|||
* @param docLen the document length.
|
||||
*/
|
||||
protected void explain(
|
||||
List<Explanation> subExpls, BasicStats stats, int doc, float freq, float docLen) {}
|
||||
List<Explanation> subExpls, BasicStats stats, int doc, double freq, double docLen) {}
|
||||
|
||||
/**
|
||||
* Explains the score. The implementation here provides a basic explanation
|
||||
* in the format <em>score(name-of-similarity, doc=doc-id,
|
||||
* freq=term-frequency), computed from:</em>, and
|
||||
* attaches the score (computed via the {@link #score(BasicStats, float, float)}
|
||||
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
|
||||
* method) and the explanation for the term frequency. Subclasses content with
|
||||
* this format may add additional details in
|
||||
* {@link #explain(List, BasicStats, int, float, float)}.
|
||||
* {@link #explain(List, BasicStats, int, double, double)}.
|
||||
*
|
||||
* @param stats the corpus level statistics.
|
||||
* @param doc the document id.
|
||||
|
@ -179,12 +179,12 @@ public abstract class SimilarityBase extends Similarity {
|
|||
* @return the explanation.
|
||||
*/
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, float docLen) {
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
score(stats, freq.getValue(), docLen),
|
||||
(float) score(stats, freq.getValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
|
||||
subs);
|
||||
}
|
||||
|
@ -248,8 +248,8 @@ public abstract class SimilarityBase extends Similarity {
|
|||
|
||||
/** Delegates the {@link #score(int, float)} and
|
||||
* {@link #explain(int, Explanation)} methods to
|
||||
* {@link SimilarityBase#score(BasicStats, float, float)} and
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
|
||||
* {@link SimilarityBase#score(BasicStats, double, double)} and
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, double)},
|
||||
* respectively.
|
||||
*/
|
||||
final class BasicSimScorer extends SimScorer {
|
||||
|
@ -261,9 +261,9 @@ public abstract class SimilarityBase extends Similarity {
|
|||
this.norms = norms;
|
||||
}
|
||||
|
||||
float getLengthValue(int doc) throws IOException {
|
||||
double getLengthValue(int doc) throws IOException {
|
||||
if (norms == null) {
|
||||
return 1F;
|
||||
return 1D;
|
||||
}
|
||||
if (norms.advanceExact(doc)) {
|
||||
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
|
||||
|
@ -275,7 +275,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
// We have to supply something in case norms are omitted
|
||||
return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
|
||||
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -450,7 +450,9 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
final long df = termStats.docFreq();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
|
||||
return Explanation.match(idf, "idf(docFreq, docCount)",
|
||||
Explanation.match(df, "docFreq, number of documents containing term"),
|
||||
Explanation.match(docCount, "docCount, total number of documents with field"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -643,20 +645,37 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
"fieldNorm(doc=" + doc + ")");
|
||||
|
||||
return Explanation.match(
|
||||
tfExplanation.getValue() * stats.idf.getValue() * fieldNormExpl.getValue(),
|
||||
tfExplanation.getValue() * fieldNormExpl.getValue(),
|
||||
"fieldWeight in " + doc + ", product of:",
|
||||
tfExplanation, stats.idf, fieldNormExpl);
|
||||
tfExplanation, fieldNormExpl);
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
Explanation queryExpl = Explanation.match(stats.boost, "boost");
|
||||
Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
|
||||
if (stats.boost == 1f) {
|
||||
return fieldExpl;
|
||||
List<Explanation> subs = new ArrayList<Explanation>();
|
||||
if (stats.boost != 1F) {
|
||||
subs.add(Explanation.match(stats.boost, "boost"));
|
||||
}
|
||||
subs.add(stats.idf);
|
||||
Explanation tf = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
|
||||
subs.add(tf);
|
||||
|
||||
float norm;
|
||||
if (norms == null) {
|
||||
norm = 1f;
|
||||
} else if (norms.advanceExact(doc) == false) {
|
||||
norm = 0f;
|
||||
} else {
|
||||
norm = normTable[(int) (norms.longValue() & 0xFF)];
|
||||
}
|
||||
|
||||
Explanation fieldNorm = Explanation.match(
|
||||
norm,
|
||||
"fieldNorm(doc=" + doc + ")");
|
||||
subs.add(fieldNorm);
|
||||
|
||||
return Explanation.match(
|
||||
queryExpl.getValue() * fieldExpl.getValue(),
|
||||
stats.queryWeight * tf.getValue() * norm,
|
||||
"score(doc="+doc+",freq="+freq.getValue()+"), product of:",
|
||||
queryExpl, fieldExpl);
|
||||
subs);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -97,7 +97,7 @@
|
|||
* The easiest way to quickly implement a new ranking method is to extend
|
||||
* {@link org.apache.lucene.search.similarities.SimilarityBase}, which provides
|
||||
* basic implementations for the low level . Subclasses are only required to
|
||||
* implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)}
|
||||
* implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, double, double)}
|
||||
* and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()}
|
||||
* methods.
|
||||
*
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public abstract class AxiomaticTestCase extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
// axiomatic parameter s
|
||||
final float s;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
s = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
s = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
s = 1;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
s = random.nextFloat();
|
||||
break;
|
||||
}
|
||||
// axiomatic query length
|
||||
final int queryLen;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
queryLen = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
queryLen = 1;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
queryLen = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
queryLen = random.nextInt(Integer.MAX_VALUE);
|
||||
break;
|
||||
}
|
||||
// axiomatic parameter k
|
||||
final float k;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
k = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
k = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
k = 1;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
k = random.nextFloat();
|
||||
break;
|
||||
}
|
||||
|
||||
return getAxiomaticModel(s, queryLen, k);
|
||||
}
|
||||
|
||||
protected abstract Similarity getAxiomaticModel(float s, int queryLen, float k);
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public abstract class BasicModelTestCase extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
final AfterEffect afterEffect;
|
||||
switch(random.nextInt(3)) {
|
||||
case 0:
|
||||
afterEffect = new AfterEffect.NoAfterEffect();
|
||||
break;
|
||||
case 1:
|
||||
afterEffect = new AfterEffectL();
|
||||
break;
|
||||
default:
|
||||
afterEffect = new AfterEffectB();
|
||||
break;
|
||||
}
|
||||
// normalization hyper-parameter c
|
||||
final float c;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
c = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
c = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" c values but don't enforce this anywhere.
|
||||
c = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
c = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
// normalization hyper-parameter z
|
||||
final float z;
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
z = Float.MIN_VALUE;
|
||||
break;
|
||||
case 1:
|
||||
// maximum value
|
||||
z = Math.nextDown(0.5f);
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
float zcand = random.nextFloat() / 2;
|
||||
if (zcand == 0f) {
|
||||
// nextFloat returns 0 inclusive, we have to avoid it.
|
||||
z = Math.nextUp(zcand);
|
||||
} else {
|
||||
z = zcand;
|
||||
}
|
||||
}
|
||||
// dirichlet parameter mu
|
||||
final float mu;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
mu = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
mu = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" mu values but don't enforce this anywhere.
|
||||
mu = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
mu = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
final Normalization normalization;
|
||||
switch(random.nextInt(5)) {
|
||||
case 0:
|
||||
normalization = new Normalization.NoNormalization();
|
||||
break;
|
||||
case 1:
|
||||
normalization = new NormalizationH1(c);
|
||||
break;
|
||||
case 2:
|
||||
normalization = new NormalizationH2(c);
|
||||
break;
|
||||
case 3:
|
||||
normalization = new NormalizationH3(mu);
|
||||
break;
|
||||
default:
|
||||
normalization = new NormalizationZ(z);
|
||||
break;
|
||||
}
|
||||
return new DFRSimilarity(getBasicModel(), afterEffect, normalization);
|
||||
}
|
||||
|
||||
/** return BasicModel under test */
|
||||
protected abstract BasicModel getBasicModel();
|
||||
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public abstract class DistributionTestCase extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
final Lambda lambda;
|
||||
if (random.nextBoolean()) {
|
||||
lambda = new LambdaDF();
|
||||
} else {
|
||||
lambda = new LambdaTTF();
|
||||
}
|
||||
|
||||
// normalization hyper-parameter c
|
||||
final float c;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
c = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
c = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" c values but don't enforce this anywhere.
|
||||
c = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
c = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
// normalization hyper-parameter z
|
||||
final float z;
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
z = Float.MIN_VALUE;
|
||||
break;
|
||||
case 1:
|
||||
// maximum value
|
||||
z = Math.nextDown(0.5f);
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
float zcand = random.nextFloat() / 2;
|
||||
if (zcand == 0f) {
|
||||
// nextFloat returns 0 inclusive, we have to avoid it.
|
||||
z = Math.nextUp(zcand);
|
||||
} else {
|
||||
z = zcand;
|
||||
}
|
||||
}
|
||||
// dirichlet parameter mu
|
||||
final float mu;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
mu = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
mu = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" mu values but don't enforce this anywhere.
|
||||
mu = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
mu = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
final Normalization normalization;
|
||||
switch(random.nextInt(5)) {
|
||||
case 0:
|
||||
normalization = new Normalization.NoNormalization();
|
||||
break;
|
||||
case 1:
|
||||
normalization = new NormalizationH1(c);
|
||||
break;
|
||||
case 2:
|
||||
normalization = new NormalizationH2(c);
|
||||
break;
|
||||
case 3:
|
||||
normalization = new NormalizationH3(mu);
|
||||
break;
|
||||
default:
|
||||
normalization = new NormalizationZ(z);
|
||||
break;
|
||||
}
|
||||
return new IBSimilarity(getDistribution(), lambda, normalization);
|
||||
}
|
||||
|
||||
/** return BasicModel under test */
|
||||
protected abstract Distribution getDistribution();
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns NaN scores for sloppy freqs < 1 (due to log without floor)
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestAxiomaticF1EXP extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
return new AxiomaticF1EXP(s, k);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns NaN scores for sloppy freqs < 1 (due to log without floor)
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestAxiomaticF1LOG extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
return new AxiomaticF1LOG(s);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestAxiomaticF2EXP extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
return new AxiomaticF2EXP(s, k);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestAxiomaticF2LOG extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
return new AxiomaticF2LOG(s);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns negative scores at least, but it (now) warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestAxiomaticF3EXP extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
// TODO: use the randomized parameters and not these hardcoded ones
|
||||
return new AxiomaticF3EXP(0.25f, 1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns negative scores at least, but it (now) warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestAxiomaticF3LOG extends AxiomaticTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
|
||||
// TODO: use the randomized parameters and not these hardcoded ones
|
||||
return new AxiomaticF3LOG(0.25f, 1);
|
||||
}
|
||||
|
||||
}
|
|
@ -17,10 +17,9 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import java.util.Random;
|
||||
|
||||
public class TestBM25Similarity extends LuceneTestCase {
|
||||
public class TestBM25Similarity extends BaseSimilarityTestCase {
|
||||
|
||||
public void testIllegalK1() {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
|
@ -61,17 +60,51 @@ public class TestBM25Similarity extends LuceneTestCase {
|
|||
assertTrue(expected.getMessage().contains("illegal b value"));
|
||||
}
|
||||
|
||||
private static Explanation findExplanation(Explanation expl, String text) {
|
||||
if (expl.getDescription().equals(text)) {
|
||||
return expl;
|
||||
} else {
|
||||
for (Explanation sub : expl.getDetails()) {
|
||||
Explanation match = findExplanation(sub, text);
|
||||
if (match != null) {
|
||||
return match;
|
||||
}
|
||||
}
|
||||
@Override
|
||||
protected Similarity getSimilarity(Random random) {
|
||||
// term frequency normalization parameter k1
|
||||
final float k1;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
k1 = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
k1 = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// upper bounds on individual term's score is 43.262806 * (k1 + 1) * boost
|
||||
// we just limit the test to "reasonable" k1 values but don't enforce this anywhere.
|
||||
k1 = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
k1 = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
return null;
|
||||
|
||||
// length normalization parameter b [0 .. 1]
|
||||
final float b;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
b = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
b = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
b = 1;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
b = random.nextFloat();
|
||||
break;
|
||||
}
|
||||
return new BM25Similarity(k1, b);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns negative scores at least, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelBE extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelBE();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// scores go backwards with respect to TF, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelD extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelD();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestBasicModelG extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelG();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestBasicModelIF extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelIF();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestBasicModelIn extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelIn();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestBasicModelIne extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelIne();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
//scores go backwards with respect to TF, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelP extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelP();
|
||||
}
|
||||
|
||||
}
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
|
@ -32,11 +33,10 @@ import org.apache.lucene.search.PhraseQuery;
|
|||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestBooleanSimilarity extends LuceneTestCase {
|
||||
public class TestBooleanSimilarity extends BaseSimilarityTestCase {
|
||||
|
||||
public void testTermScoreIsEqualToBoost() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
|
@ -114,4 +114,9 @@ public class TestBooleanSimilarity extends LuceneTestCase {
|
|||
0f);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Similarity getSimilarity(Random random) {
|
||||
return new BooleanSimilarity();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
|
@ -39,11 +40,10 @@ import org.apache.lucene.search.TopDocs;
|
|||
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestClassicSimilarity extends LuceneTestCase {
|
||||
public class TestClassicSimilarity extends BaseSimilarityTestCase {
|
||||
private Directory directory;
|
||||
private IndexReader indexReader;
|
||||
private IndexSearcher indexSearcher;
|
||||
|
@ -185,4 +185,9 @@ public class TestClassicSimilarity extends LuceneTestCase {
|
|||
0f);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Similarity getSimilarity(Random random) {
|
||||
return new ClassicSimilarity();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
public class TestDistributionLL extends DistributionTestCase {
|
||||
|
||||
@Override
|
||||
protected Distribution getDistribution() {
|
||||
return new DistributionLL();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// scores go infinite, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestDistributionSPL extends DistributionTestCase {
|
||||
|
||||
@Override
|
||||
protected Distribution getDistribution() {
|
||||
return new DistributionSPL();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestIndependenceChiSquared extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
return new DFISimilarity(new IndependenceChiSquared());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestIndependenceSaturated extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
return new DFISimilarity(new IndependenceSaturated());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestIndependenceStandardized extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
return new DFISimilarity(new IndependenceStandardized());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestLMDirichletSimilarity extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
// smoothing parameter mu, unbounded
|
||||
final float mu;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
mu = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
mu = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" mu values but don't enforce this anywhere.
|
||||
mu = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
mu = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
return new LMDirichletSimilarity(mu);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestLMJelinekMercerSimilarity extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
// smoothing parameter lambda: (0..1]
|
||||
final float lambda;
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// tiny value
|
||||
lambda = Float.MIN_VALUE;
|
||||
break;
|
||||
case 1:
|
||||
// maximum value
|
||||
lambda = 1;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
lambda = random.nextFloat();
|
||||
break;
|
||||
}
|
||||
return new LMJelinekMercerSimilarity(lambda);
|
||||
}
|
||||
|
||||
}
|
|
@ -51,7 +51,7 @@ import org.apache.lucene.util.Version;
|
|||
* items in the list. If a test case fails, the name of the Similarity that
|
||||
* caused the failure is returned as part of the assertion error message.</p>
|
||||
* <p>Unit testing is performed by constructing statistics manually and calling
|
||||
* the {@link SimilarityBase#score(BasicStats, float, float)} method of the
|
||||
* the {@link SimilarityBase#score(BasicStats, double, double)} method of the
|
||||
* Similarities. The statistics represent corner cases of corpus distributions.
|
||||
* </p>
|
||||
* <p>For the integration tests, a small (8-document) collection is indexed. The
|
||||
|
@ -191,17 +191,17 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
}
|
||||
/**
|
||||
* The generic test core called by all unit test methods. It calls the
|
||||
* {@link SimilarityBase#score(BasicStats, float, float)} method of all
|
||||
* {@link SimilarityBase#score(BasicStats, double, double)} method of all
|
||||
* Similarities in {@link #sims} and checks if the score is valid; i.e. it
|
||||
* is a finite positive real number.
|
||||
*/
|
||||
private void unitTestCore(BasicStats stats, float freq, int docLen) {
|
||||
for (SimilarityBase sim : sims) {
|
||||
BasicStats realStats = (BasicStats) sim.computeWeight(
|
||||
stats.getBoost(),
|
||||
(float)stats.getBoost(),
|
||||
toCollectionStats(stats),
|
||||
toTermStats(stats));
|
||||
float score = sim.score(realStats, freq, docLen);
|
||||
float score = (float)sim.score(realStats, freq, docLen);
|
||||
float explScore = sim.explain(
|
||||
realStats, 1, Explanation.match(freq, "freq"), docLen).getValue();
|
||||
assertFalse("Score infinite: " + sim.toString(), Float.isInfinite(score));
|
||||
|
@ -524,17 +524,17 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
|
||||
/**
|
||||
* The generic test core called by all correctness test methods. It calls the
|
||||
* {@link SimilarityBase#score(BasicStats, float, float)} method of all
|
||||
* {@link SimilarityBase#score(BasicStats, double, double)} method of all
|
||||
* Similarities in {@link #sims} and compares the score against the manually
|
||||
* computed {@code gold}.
|
||||
*/
|
||||
private void correctnessTestCore(SimilarityBase sim, float gold) {
|
||||
BasicStats stats = createStats();
|
||||
BasicStats realStats = (BasicStats) sim.computeWeight(
|
||||
stats.getBoost(),
|
||||
(float)stats.getBoost(),
|
||||
toCollectionStats(stats),
|
||||
toTermStats(stats));
|
||||
float score = sim.score(realStats, FREQ, DOC_LEN);
|
||||
float score = (float) sim.score(realStats, FREQ, DOC_LEN);
|
||||
assertEquals(
|
||||
sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON);
|
||||
}
|
||||
|
|
|
@ -1484,7 +1484,7 @@ public class TestBlockJoin extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
return freq;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
|||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.Random;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
@ -318,6 +319,8 @@ public class CheckHits {
|
|||
public static float explainToleranceDelta(float f1, float f2) {
|
||||
return Math.max(EXPLAIN_SCORE_TOLERANCE_MINIMUM, Math.max(Math.abs(f1), Math.abs(f2)) * EXPLAIN_SCORE_TOLERANCE_DELTA);
|
||||
}
|
||||
|
||||
private static final Pattern COMPUTED_FROM_PATTERN = Pattern.compile(".*, computed as .* from:");
|
||||
|
||||
/**
|
||||
* Assert that an explanation has the expected score, and optionally that its
|
||||
|
@ -335,9 +338,12 @@ public class CheckHits {
|
|||
boolean deep,
|
||||
Explanation expl) {
|
||||
float value = expl.getValue();
|
||||
Assert.assertEquals(q+": score(doc="+doc+")="+score+
|
||||
" != explanationScore="+value+" Explanation: "+expl,
|
||||
score,value,explainToleranceDelta(score, value));
|
||||
// TODO: clean this up if we use junit 5 (the assert message is costly)
|
||||
try {
|
||||
Assert.assertEquals(score, value, explainToleranceDelta(score, value));
|
||||
} catch (Exception e) {
|
||||
Assert.fail(q+": score(doc="+doc+")="+score+" != explanationScore="+value+" Explanation: "+expl);
|
||||
}
|
||||
|
||||
if (!deep) return;
|
||||
|
||||
|
@ -368,7 +374,7 @@ public class CheckHits {
|
|||
boolean productOf = descr.endsWith("product of:");
|
||||
boolean sumOf = descr.endsWith("sum of:");
|
||||
boolean maxOf = descr.endsWith("max of:");
|
||||
boolean computedOf = descr.matches(".*, computed as .* from:");
|
||||
boolean computedOf = descr.indexOf("computed as") > 0 && COMPUTED_FROM_PATTERN.matcher(descr).matches();
|
||||
boolean maxTimesOthers = false;
|
||||
if (!(productOf || sumOf || maxOf || computedOf)) {
|
||||
// maybe 'max plus x times others'
|
||||
|
@ -386,11 +392,12 @@ public class CheckHits {
|
|||
}
|
||||
}
|
||||
// TODO: this is a TERRIBLE assertion!!!!
|
||||
Assert.assertTrue(
|
||||
q+": multi valued explanation description=\""+descr
|
||||
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
|
||||
+" or 'sum of:' or 'max of:' - "+expl,
|
||||
productOf || sumOf || maxOf || computedOf || maxTimesOthers);
|
||||
if (false == (productOf || sumOf || maxOf || computedOf || maxTimesOthers)) {
|
||||
Assert.fail(
|
||||
q+": multi valued explanation description=\""+descr
|
||||
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
|
||||
+" or 'sum of:' or 'max of:' - "+expl);
|
||||
}
|
||||
float sum = 0;
|
||||
float product = 1;
|
||||
float max = 0;
|
||||
|
@ -414,9 +421,13 @@ public class CheckHits {
|
|||
Assert.assertTrue("should never get here!", computedOf);
|
||||
combined = value;
|
||||
}
|
||||
Assert.assertEquals(q+": actual subDetails combined=="+combined+
|
||||
" != value="+value+" Explanation: "+expl,
|
||||
combined,value,explainToleranceDelta(combined, value));
|
||||
// TODO: clean this up if we use junit 5 (the assert message is costly)
|
||||
try {
|
||||
Assert.assertEquals(combined, value, explainToleranceDelta(combined, value));
|
||||
} catch (Exception e) {
|
||||
Assert.fail(q+": actual subDetails combined=="+combined+
|
||||
" != value="+value+" Explanation: "+expl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,473 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.FilterLeafReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.search.CheckHits;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimWeight;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Abstract class to do basic tests for a similarity.
|
||||
* NOTE: This test focuses on the similarity impl, nothing else.
|
||||
* The [stretch] goal is for this test to be
|
||||
* so thorough in testing a new Similarity that if this
|
||||
* test passes, then all Lucene/Solr tests should also pass. Ie,
|
||||
* if there is some bug in a given Similarity that this
|
||||
* test fails to catch then this test needs to be improved! */
|
||||
public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
||||
|
||||
static LeafReader WITHOUT_NORM;
|
||||
static Directory WITHOUT_NORM_DIR;
|
||||
|
||||
static LeafReader WITH_NORM_BASE;
|
||||
static Directory WITH_NORM_DIR;
|
||||
static List<LeafReader> NORM_VALUES;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
// without norms
|
||||
WITHOUT_NORM_DIR = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), WITHOUT_NORM_DIR);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "value", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
WITHOUT_NORM = getOnlyLeafReader(writer.getReader());
|
||||
writer.close();
|
||||
|
||||
// with norms
|
||||
WITH_NORM_DIR = newDirectory();
|
||||
writer = new RandomIndexWriter(random(), WITH_NORM_DIR);
|
||||
doc = new Document();
|
||||
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||
fieldType.setOmitNorms(true);
|
||||
doc.add(newField("field", "value", fieldType));
|
||||
writer.addDocument(doc);
|
||||
WITH_NORM_BASE = getOnlyLeafReader(writer.getReader());
|
||||
writer.close();
|
||||
|
||||
// all possible norm values for the doc
|
||||
NORM_VALUES = new ArrayList<>();
|
||||
NORM_VALUES.add(WITHOUT_NORM);
|
||||
for (int i = 1; i < 256; i++) {
|
||||
final long value = i;
|
||||
NORM_VALUES.add(new FilterLeafReader(WITH_NORM_BASE) {
|
||||
@Override
|
||||
public CacheHelper getCoreCacheHelper() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CacheHelper getReaderCacheHelper() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNormValues(String field) throws IOException {
|
||||
if (field.equals("field")) {
|
||||
return new CannedNorm(value);
|
||||
} else {
|
||||
return super.getNormValues(field);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() throws Exception {
|
||||
IOUtils.close(WITH_NORM_BASE, WITH_NORM_DIR, WITHOUT_NORM, WITHOUT_NORM_DIR);
|
||||
WITH_NORM_BASE = WITHOUT_NORM = null;
|
||||
WITH_NORM_DIR = WITHOUT_NORM_DIR = null;
|
||||
NORM_VALUES = null;
|
||||
}
|
||||
|
||||
/** 1-document norms impl of the given value */
|
||||
static class CannedNorm extends NumericDocValues {
|
||||
int docID = -1;
|
||||
final long value;
|
||||
|
||||
CannedNorm(long value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long longValue() throws IOException {
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean advanceExact(int target) throws IOException {
|
||||
assert target == 0;
|
||||
docID = target;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (docID == -1) {
|
||||
return docID = 0;
|
||||
} else {
|
||||
return docID = NO_MORE_DOCS;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
if (target == 0) {
|
||||
return docID = 0;
|
||||
} else {
|
||||
return docID = NO_MORE_DOCS;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Return a new similarity with all parameters randomized within valid ranges.
 */
protected abstract Similarity getSimilarity(Random random);

// upper limit on maxDoc for generated corpora; kept far below Long.MAX_VALUE
// so the token-count arithmetic below cannot overflow
static final long MAXDOC_FORTESTING = 1L << 48;
// must be at least MAXDOC_FORTESTING + Integer.MAX_VALUE
static final long MAXTOKENS_FORTESTING = 1L << 49;

/**
 * returns a random corpus that is at least possible given
 * the norm value for a single document.
 */
static CollectionStatistics newCorpus(Random random, int norm) {
  // lower bound of tokens in the collection (you produced this norm somehow)
  final int lowerBound;
  if (norm == 0) {
    // norms are omitted, but there must have been at least one token to produce that norm
    lowerBound = 1;
  } else {
    // minimum value that would decode to such a norm
    lowerBound = SmallFloat.byte4ToInt((byte) norm);
  }
  final long maxDoc;
  if (random.nextBoolean()) {
    // small collection
    maxDoc = TestUtil.nextLong(random, 1, 100000);
  } else {
    // very large collection
    maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
  }
  // TODO: make this a mandatory statistic, or test it with -1
  final long docCount;
  if (random.nextBoolean()) {
    // sparse field
    docCount = TestUtil.nextLong(random, 1, maxDoc);
  } else {
    // fully populated
    docCount = maxDoc;
  }
  // random docsize: but can't require docs to have > 2B tokens
  long upperBound;
  try {
    upperBound = Math.min(MAXTOKENS_FORTESTING, Math.multiplyExact(docCount, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    // docCount * Integer.MAX_VALUE overflowed a long; fall back to the global cap
    upperBound = MAXTOKENS_FORTESTING;
  }
  // TODO: make this a mandatory statistic, or test it with -1
  final long sumDocFreq;
  if (random.nextBoolean()) {
    // shortest possible docs
    sumDocFreq = docCount;
  } else {
    // random docsize
    sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
  }
  final long sumTotalTermFreq;
  switch (random.nextInt(3)) {
    case 0:
      // unsupported (e.g. omitTF)
      sumTotalTermFreq = -1;
      break;
    case 1:
      // no repetition of terms (except to satisfy this norm)
      sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
      break;
    default:
      // random repetition
      assert sumDocFreq - 1 + lowerBound <= upperBound;
      sumTotalTermFreq = TestUtil.nextLong(random, sumDocFreq - 1 + lowerBound, upperBound);
      break;
  }
  return new CollectionStatistics("field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
}
|
||||
|
||||
// the single query term used by all generated term statistics
private static final BytesRef TERM = new BytesRef("term");

/**
 * returns new random term, that fits within the bounds of the corpus
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  final long docFreq;
  if (random.nextBoolean()) {
    // rare term
    docFreq = 1;
  } else {
    // random specificity
    docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
  }
  final long totalTermFreq;
  if (corpus.sumTotalTermFreq() == -1) {
    // omitTF: the corpus reports no total term frequency, so neither does the term
    totalTermFreq = -1;
  } else if (random.nextBoolean()) {
    // no repetition
    totalTermFreq = docFreq;
  } else {
    // random repetition: but can't require docs to have > 2B tokens
    long upperBound;
    try {
      upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
    } catch (ArithmeticException overflow) {
      // docFreq * Integer.MAX_VALUE overflowed a long; fall back to the corpus total
      upperBound = corpus.sumTotalTermFreq();
    }
    totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
  }
  return new TermStatistics(TERM, docFreq, totalTermFreq);
}
|
||||
|
||||
/**
 * Tests scoring across a bunch of random terms/corpora/frequencies for each possible document length.
 * It does the following checks:
 * <ul>
 *   <li>scores are non-negative and finite.
 *   <li>score matches the explanation exactly.
 *   <li>internal explanations calculations are sane (e.g. sum of: and so on actually compute sums)
 *   <li>scores don't decrease as term frequencies increase: e.g. score(freq=N + 1) &gt;= score(freq=N)
 *   <li>scores don't decrease as documents get shorter, e.g. score(len=M) &gt;= score(len=M+1)
 *   <li>scores don't decrease as terms get rarer, e.g. score(term=N) &gt;= score(term=N+1)
 *   <li>scoring works for floating point frequencies (e.g. sloppy phrase and span queries will work)
 *   <li>scoring works for reasonably large 64-bit statistic values (e.g. distributed search will work)
 *   <li>scoring works for reasonably large boost values (0 .. Integer.MAX_VALUE, e.g. query boosts will work)
 *   <li>scoring works for parameters randomized within valid ranges (see {@link #getSimilarity(Random)})
 * </ul>
 */
public void testRandomScoring() throws Exception {
  Random random = random();
  final int iterations = atLeast(10);
  for (int i = 0; i < iterations; i++) {
    // pull a new similarity to switch up parameters
    Similarity similarity = getSimilarity(random);
    for (int j = 0; j < 10; j++) {
      // for each norm value...
      for (int k = 0; k < NORM_VALUES.size(); k++) {
        // corpus statistics consistent with a document carrying norm value k
        CollectionStatistics corpus = newCorpus(random, k);
        for (int l = 0; l < 10; l++) {
          TermStatistics term = newTerm(random, corpus);
          // pick a within-document frequency consistent with the term statistics
          final float freq;
          if (term.totalTermFreq() == -1) {
            // omit TF
            freq = 1;
          } else if (term.docFreq() == 1) {
            // only one document, all the instances must be here.
            freq = Math.toIntExact(term.totalTermFreq());
          } else {
            // there is at least one other document, and those must have at least 1 instance each.
            int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
            if (random.nextBoolean()) {
              freq = TestUtil.nextInt(random, 1, upperBound);
            } else {
              float freqCandidate = upperBound * random.nextFloat();
              // we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case.
              // this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
              if (freqCandidate <= Float.MIN_VALUE) {
                freqCandidate = Math.nextUp(Float.MIN_VALUE);
              }
              freq = freqCandidate;
            }
          }
          // we just limit the test to "reasonable" boost values but don't enforce this anywhere.
          // too big, and you are asking for overflow. that's hard for a sim to enforce (but definitely possible)
          // for now, we just want to detect overflow where its a real bug/hazard in the computation with reasonable inputs.
          final float boost;
          switch (random.nextInt(5)) {
            case 0:
              // minimum value (not enforced)
              boost = 0F;
              break;
            case 1:
              // tiny value
              boost = Float.MIN_VALUE;
              break;
            case 2:
              // no-op value (sometimes treated special in explanations)
              boost = 1F;
              break;
            case 3:
              // maximum value (not enforced)
              boost = Integer.MAX_VALUE;
              break;
            default:
              // random value
              boost = random.nextFloat() * Integer.MAX_VALUE;
              break;
          }
          doTestScoring(similarity, corpus, term, boost, freq, k);
        }
      }
    }
  }
}
|
||||
|
||||
/** runs for a single test case, so that if you hit a test failure you can write a reproducer just for that scenario */
|
||||
private static void doTestScoring(Similarity similarity, CollectionStatistics corpus, TermStatistics term, float boost, float freq, int norm) throws IOException {
|
||||
boolean success = false;
|
||||
SimWeight weight = similarity.computeWeight(boost, corpus, term);
|
||||
SimScorer scorer = similarity.simScorer(weight, NORM_VALUES.get(norm).getContext());
|
||||
try {
|
||||
float score = scorer.score(0, freq);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue("infinite/NaN score: " + score, Float.isFinite(score));
|
||||
assertTrue("negative score: " + score, score >= 0);
|
||||
// check explanation matches
|
||||
Explanation explanation = scorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
|
||||
if (score != explanation.getValue()) {
|
||||
fail("expected: " + score + ", got: " + explanation);
|
||||
}
|
||||
CheckHits.verifyExplanation("<test query>", 0, score, true, explanation);
|
||||
|
||||
// check score(freq-1), given the same norm it should be <= score(freq) [scores non-decreasing for more term occurrences]
|
||||
final float prevFreq;
|
||||
if (random().nextBoolean() && freq == (int)freq && freq > 1 && term.docFreq() > 1) {
|
||||
// previous in integer space
|
||||
prevFreq = freq - 1;
|
||||
} else {
|
||||
// previous in float space (e.g. for sloppyPhrase)
|
||||
prevFreq = Math.nextDown(freq);
|
||||
}
|
||||
|
||||
float prevScore = scorer.score(0, prevFreq);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevScore));
|
||||
assertTrue(prevScore >= 0);
|
||||
// check explanation matches
|
||||
Explanation prevExplanation = scorer.explain(0, Explanation.match(prevFreq, "freq, occurrences of term within document"));
|
||||
if (prevScore != prevExplanation.getValue()) {
|
||||
fail("expected: " + prevScore + ", got: " + prevExplanation);
|
||||
}
|
||||
CheckHits.verifyExplanation("test query (prevFreq)", 0, prevScore, true, prevExplanation);
|
||||
|
||||
if (prevScore > score) {
|
||||
System.out.println(prevExplanation);
|
||||
System.out.println(explanation);
|
||||
fail("score(" + prevFreq + ")=" + prevScore + " > score(" + freq + ")=" + score);
|
||||
}
|
||||
|
||||
// check score(norm-1), given the same freq it should be >= score(norm) [scores non-decreasing as docs get shorter]
|
||||
if (norm > 1) {
|
||||
SimScorer prevNormScorer = similarity.simScorer(weight, NORM_VALUES.get(norm - 1).getContext());
|
||||
float prevNormScore = prevNormScorer.score(0, freq);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevNormScore));
|
||||
assertTrue(prevNormScore >= 0);
|
||||
// check explanation matches
|
||||
Explanation prevNormExplanation = prevNormScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
|
||||
if (prevNormScore != prevNormExplanation.getValue()) {
|
||||
fail("expected: " + prevNormScore + ", got: " + prevNormExplanation);
|
||||
}
|
||||
CheckHits.verifyExplanation("test query (prevNorm)", 0, prevNormScore, true, prevNormExplanation);
|
||||
|
||||
if (prevNormScore < score) {
|
||||
System.out.println(prevNormExplanation);
|
||||
System.out.println(explanation);
|
||||
fail("score(" + freq + "," + (norm-1) + ")=" + prevNormScore + " < score(" + freq + "," + norm + ")=" + score);
|
||||
}
|
||||
}
|
||||
|
||||
// check score(term-1), given the same freq/norm it should be >= score(term) [scores non-decreasing as terms get rarer]
|
||||
if (term.docFreq() > 1 && (term.totalTermFreq() == -1 || freq < term.totalTermFreq())) {
|
||||
final long prevTotalTermFreq;
|
||||
if (term.totalTermFreq() == -1) {
|
||||
prevTotalTermFreq = -1;
|
||||
} else {
|
||||
prevTotalTermFreq = term.totalTermFreq() - 1;
|
||||
}
|
||||
TermStatistics prevTerm = new TermStatistics(term.term(), term.docFreq() - 1, prevTotalTermFreq);
|
||||
SimWeight prevWeight = similarity.computeWeight(boost, corpus, term);
|
||||
SimScorer prevTermScorer = similarity.simScorer(prevWeight, NORM_VALUES.get(norm).getContext());
|
||||
float prevTermScore = prevTermScorer.score(0, freq);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevTermScore));
|
||||
assertTrue(prevTermScore >= 0);
|
||||
// check explanation matches
|
||||
Explanation prevTermExplanation = prevTermScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
|
||||
if (prevTermScore != prevTermExplanation.getValue()) {
|
||||
fail("expected: " + prevTermScore + ", got: " + prevTermExplanation);
|
||||
}
|
||||
CheckHits.verifyExplanation("test query (prevTerm)", 0, prevTermScore, true, prevTermExplanation);
|
||||
|
||||
if (prevTermScore < score) {
|
||||
System.out.println(prevTermExplanation);
|
||||
System.out.println(explanation);
|
||||
fail("score(" + freq + "," + (prevTerm) + ")=" + prevTermScore + " < score(" + freq + "," + term + ")=" + score);
|
||||
}
|
||||
}
|
||||
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
System.out.println(similarity);
|
||||
System.out.println(corpus);
|
||||
System.out.println(term);
|
||||
if (norm == 0) {
|
||||
System.out.println("norms=omitted");
|
||||
} else {
|
||||
System.out.println("norm=" + norm + " (doc length ~ " + SmallFloat.byte4ToInt((byte) norm) + ")");
|
||||
}
|
||||
System.out.println("freq=" + freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -434,7 +434,7 @@ public final class TestUtil {
|
|||
|
||||
/** start and end are BOTH inclusive */
|
||||
public static long nextLong(Random r, long start, long end) {
|
||||
assert end >= start;
|
||||
assert end >= start : "start=" + start + ",end=" + end;
|
||||
final BigInteger range = BigInteger.valueOf(end).add(BigInteger.valueOf(1)).subtract(BigInteger.valueOf(start));
|
||||
if (range.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) <= 0) {
|
||||
return start + r.nextInt(range.intValue());
|
||||
|
|
Loading…
Reference in New Issue