LUCENE-7997: More sanity testing of similarities

This commit is contained in:
Robert Muir 2017-10-24 22:48:04 -04:00
parent 81a4f7cc9c
commit 42717d5f4b
78 changed files with 1862 additions and 304 deletions

View File

@ -11,7 +11,12 @@ Changes in Runtime Behavior
will now fail to open even if they have been merged with the previous major
version. (Adrien Grand)
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
SimilarityBase switches to 64-bit doubles internally to help avoid common numeric issues.
Add missing range checks for similarity parameters.
Improve BM25 and ClassicSimilarity's explanations. (Robert Muir)
======================= Lucene 7.2.0 =======================

View File

@ -73,4 +73,22 @@ public class CollectionStatistics {
public final long sumDocFreq() {
return sumDocFreq;
}
/**
 * Renders these collection statistics as a single comma-separated line of
 * {@code name=value} pairs, with the field name wrapped in double quotes.
 *
 * @return a human-readable summary of field, maxDoc, docCount,
 *         sumTotalTermFreq and sumDocFreq, in that order
 */
@Override
public String toString() {
  return "field=\"" + field() + "\""
      + ",maxDoc=" + maxDoc()
      + ",docCount=" + docCount()
      + ",sumTotalTermFreq=" + sumTotalTermFreq()
      + ",sumDocFreq=" + sumDocFreq();
}
}

View File

@ -142,7 +142,7 @@ public class TermQuery extends Query {
if (newDoc == doc) {
float freq = scorer.freq();
SimScorer docScorer = similarity.simScorer(stats, context);
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document");
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
scoreExplanation.getValue(),

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum; // javadocs
import org.apache.lucene.util.BytesRef;
/**
@ -52,4 +53,18 @@ public class TermStatistics {
public final long totalTermFreq() {
return totalTermFreq;
}
/**
 * Renders these term statistics as a single comma-separated line of
 * {@code name=value} pairs; the term is converted with
 * {@code Term.toString} and wrapped in double quotes.
 *
 * @return a human-readable summary of term, docFreq and totalTermFreq,
 *         in that order
 */
@Override
public String toString() {
  final String quotedTerm = "\"" + Term.toString(term()) + "\"";
  return "term=" + quotedTerm
      + ",docFreq=" + docFreq()
      + ",totalTermFreq=" + totalTermFreq();
}
}

View File

@ -38,10 +38,10 @@ public abstract class AfterEffect {
public AfterEffect() {}
/** Returns the aftereffect score. */
public abstract float score(BasicStats stats, float tfn);
public abstract double score(BasicStats stats, double tfn);
/** Returns an explanation for the score. */
public abstract Explanation explain(BasicStats stats, float tfn);
public abstract Explanation explain(BasicStats stats, double tfn);
/** Implementation used when there is no aftereffect. */
public static final class NoAfterEffect extends AfterEffect {
@ -50,12 +50,12 @@ public abstract class AfterEffect {
public NoAfterEffect() {}
@Override
public final float score(BasicStats stats, float tfn) {
return 1f;
public double score(BasicStats stats, double tfn) {
return 1.0;
}
@Override
public final Explanation explain(BasicStats stats, float tfn) {
public Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(1, "no aftereffect");
}

View File

@ -29,18 +29,18 @@ public class AfterEffectB extends AfterEffect {
public AfterEffectB() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
long F = stats.getTotalTermFreq()+1;
long n = stats.getDocFreq()+1;
return (F + 1) / (n * (tfn + 1));
}
@Override
public final Explanation explain(BasicStats stats, float tfn) {
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
score(stats, tfn),
(float) score(stats, tfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(tfn, "tfn"),
Explanation.match((float) tfn, "tfn"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
Explanation.match(stats.getDocFreq(), "docFreq"));
}

View File

@ -29,16 +29,16 @@ public class AfterEffectL extends AfterEffect {
public AfterEffectL() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
return 1 / (tfn + 1);
}
@Override
public final Explanation explain(BasicStats stats, float tfn) {
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
score(stats, tfn),
(float) score(stats, tfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(tfn, "tfn"));
Explanation.match((float) tfn, "tfn"));
}
@Override

View File

@ -100,7 +100,7 @@ public abstract class Axiomatic extends SimilarityBase {
}
@Override
public float score(BasicStats stats, float freq, float docLen) {
public double score(BasicStats stats, double freq, double docLen) {
return tf(stats, freq, docLen)
* ln(stats, freq, docLen)
* tfln(stats, freq, docLen)
@ -110,19 +110,19 @@ public abstract class Axiomatic extends SimilarityBase {
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
float freq, float docLen) {
if (stats.getBoost() != 1.0f) {
subs.add(Explanation.match(stats.getBoost(), "boost"));
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
}
subs.add(Explanation.match(this.k, "k"));
subs.add(Explanation.match(this.s, "s"));
subs.add(Explanation.match(this.queryLen, "queryLen"));
subs.add(Explanation.match(tf(stats, freq, docLen), "tf"));
subs.add(Explanation.match(ln(stats, freq, docLen), "ln"));
subs.add(Explanation.match(tfln(stats, freq, docLen), "tfln"));
subs.add(Explanation.match(idf(stats, freq, docLen), "idf"));
subs.add(Explanation.match(gamma(stats, freq, docLen), "gamma"));
subs.add(Explanation.match((float) tf(stats, freq, docLen), "tf"));
subs.add(Explanation.match((float) ln(stats, freq, docLen), "ln"));
subs.add(Explanation.match((float) tfln(stats, freq, docLen), "tfln"));
subs.add(Explanation.match((float) idf(stats, freq, docLen), "idf"));
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
super.explain(subs, stats, doc, freq, docLen);
}
@ -135,25 +135,25 @@ public abstract class Axiomatic extends SimilarityBase {
/**
* compute the term frequency component
*/
protected abstract float tf(BasicStats stats, float freq, float docLen);
protected abstract double tf(BasicStats stats, double freq, double docLen);
/**
* compute the document length component
*/
protected abstract float ln(BasicStats stats, float freq, float docLen);
protected abstract double ln(BasicStats stats, double freq, double docLen);
/**
* compute the mixed term frequency and document length component
*/
protected abstract float tfln(BasicStats stats, float freq, float docLen);
protected abstract double tfln(BasicStats stats, double freq, double docLen);
/**
* compute the inverted document frequency component
*/
protected abstract float idf(BasicStats stats, float freq, float docLen);
protected abstract double idf(BasicStats stats, double freq, double docLen);
/**
* compute the gamma component (only for F3EXP and F3LOG)
*/
protected abstract float gamma(BasicStats stats, float freq, float docLen);
protected abstract double gamma(BasicStats stats, double freq, double docLen);
}

View File

@ -56,16 +56,16 @@ public class AxiomaticF1EXP extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
if (freq <= 0.0) return 0f;
return (float) (1 + Math.log(1 + Math.log(freq)));
protected double tf(BasicStats stats, double freq, double docLen) {
if (freq <= 0.0) return 0.0;
return 1 + Math.log(1 + Math.log(freq));
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
protected double ln(BasicStats stats, double freq, double docLen) {
return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s);
}
@ -73,23 +73,23 @@ public class AxiomaticF1EXP extends Axiomatic {
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tfln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
return 0f;
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
}

View File

@ -49,16 +49,16 @@ public class AxiomaticF1LOG extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
if (freq <= 0.0) return 0f;
return (float) (1 + Math.log(1 + Math.log(freq)));
protected double tf(BasicStats stats, double freq, double docLen) {
if (freq <= 0.0) return 0.0;
return 1 + Math.log(1 + Math.log(freq));
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
protected double ln(BasicStats stats, double freq, double docLen) {
return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s);
}
@ -66,23 +66,23 @@ public class AxiomaticF1LOG extends Axiomatic {
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tfln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
return 0f;
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
}

View File

@ -56,23 +56,23 @@ public class AxiomaticF2EXP extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tf(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double ln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
protected double tfln(BasicStats stats, double freq, double docLen) {
return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength());
}
@ -80,15 +80,15 @@ public class AxiomaticF2EXP extends Axiomatic {
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
return 0f;
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
}

View File

@ -48,23 +48,23 @@ public class AxiomaticF2LOG extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tf(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double ln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
protected double tfln(BasicStats stats, double freq, double docLen) {
return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength());
}
@ -72,15 +72,15 @@ public class AxiomaticF2LOG extends Axiomatic {
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
return 0f;
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
}

View File

@ -17,10 +17,10 @@
package org.apache.lucene.search.similarities;
/**
* F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq
* gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl
*
* NOTE: the gamma function of this similarity creates negative scores
* @lucene.experimental
*/
public class AxiomaticF3EXP extends Axiomatic {
@ -55,40 +55,40 @@ public class AxiomaticF3EXP extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
if (freq <= 0.0) return 0f;
return (float) (1 + Math.log(1 + Math.log(freq)));
protected double tf(BasicStats stats, double freq, double docLen) {
if (freq <= 0.0) return 0.0;
return 1 + Math.log(1 + Math.log(freq));
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double ln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tfln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k);
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
protected double gamma(BasicStats stats, double freq, double docLen) {
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
}
}

View File

@ -17,10 +17,10 @@
package org.apache.lucene.search.similarities;
/**
* F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* F3LOG is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq
* gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl
*
* NOTE: the gamma function of this similarity creates negative scores
* @lucene.experimental
*/
public class AxiomaticF3LOG extends Axiomatic {
@ -44,40 +44,40 @@ public class AxiomaticF3LOG extends Axiomatic {
* compute the term frequency component
*/
@Override
protected float tf(BasicStats stats, float freq, float docLen) {
if (freq <= 0.0) return 0f;
return (float) (1 + Math.log(1 + Math.log(freq)));
protected double tf(BasicStats stats, double freq, double docLen) {
if (freq <= 0.0) return 0.0;
return 1 + Math.log(1 + Math.log(freq));
}
/**
* compute the document length component
*/
@Override
protected float ln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double ln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the mixed term frequency and document length component
*/
@Override
protected float tfln(BasicStats stats, float freq, float docLen) {
return 1f;
protected double tfln(BasicStats stats, double freq, double docLen) {
return 1.0;
}
/**
* compute the inverted document frequency component
*/
@Override
protected float idf(BasicStats stats, float freq, float docLen) {
return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
protected double idf(BasicStats stats, double freq, double docLen) {
return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq());
}
/**
* compute the gamma component
*/
@Override
protected float gamma(BasicStats stats, float freq, float docLen) {
protected double gamma(BasicStats stats, double freq, double docLen) {
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
}
}

View File

@ -159,9 +159,9 @@ public class BM25Similarity extends Similarity {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
Explanation.match(df, "docFreq"),
Explanation.match(docCount, "docCount"));
return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
Explanation.match(df, "n, number of documents containing term"),
Explanation.match(docCount, "N, total number of documents with field"));
}
/**
@ -185,7 +185,7 @@ public class BM25Similarity extends Similarity {
details.add(idfExplain);
idf += idfExplain.getValue();
}
return Explanation.match((float) idf, "idf(), sum of:", details);
return Explanation.match((float) idf, "idf, sum of:", details);
}
@Override
@ -197,7 +197,7 @@ public class BM25Similarity extends Similarity {
for (int i = 0; i < cache.length; i++) {
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache);
}
@Override
@ -217,7 +217,7 @@ public class BM25Similarity extends Similarity {
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.weight * (k1 + 1);
this.weightValue = stats.weight;
this.norms = norms;
lengthCache = LENGTH_TABLE;
cache = stats.cache;
@ -226,7 +226,7 @@ public class BM25Similarity extends Similarity {
@Override
public float score(int doc, float freq) throws IOException {
// if there are no norms, we act as if b=0
float norm;
double norm;
if (norms == null) {
norm = k1;
} else {
@ -236,12 +236,48 @@ public class BM25Similarity extends Similarity {
norm = cache[0];
}
}
return weightValue * freq / (freq + norm);
return weightValue * (float) (freq / (freq + norm));
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms, lengthCache);
List<Explanation> subs = new ArrayList<>();
subs.addAll(stats.explain());
Explanation tfExpl = explainTF(doc, freq);
subs.add(tfExpl);
return Explanation.match(stats.weight * tfExpl.getValue(),
"score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs);
}
/**
 * Builds the explanation of the term-frequency factor for one document.
 * <p>
 * Without norms the factor is {@code freq / (freq + k1)}; with norms it is
 * {@code freq / (freq + k1 * (1 - b + b * dl / avgdl))}, where {@code dl} is
 * looked up in {@code lengthCache} from the document's norm byte.
 *
 * @param doc  document whose norm is read
 * @param freq explanation carrying the term frequency within the document
 * @return explanation whose value is the tf factor and whose details list
 *         the inputs used to compute it
 * @throws IOException if reading the norm fails
 */
private Explanation explainTF(int doc, Explanation freq) throws IOException {
  final List<Explanation> details = new ArrayList<>();
  details.add(freq);
  details.add(Explanation.match(k1, "k1, term saturation parameter"));

  // No norms for this field: length normalization is reported as b=0.
  if (norms == null) {
    details.add(Explanation.match(0, "b, field omits length norms"));
    return Explanation.match(
        (float) (freq.getValue() / (freq.getValue() + (double) k1)),
        "tf, computed as freq / (freq + k1) from:", details);
  }

  // A document without a norm value is treated as norm byte 0.
  final int normIndex = norms.advanceExact(doc) ? ((byte) norms.longValue()) & 0xff : 0;
  final float doclen = lengthCache[normIndex];

  details.add(Explanation.match(b, "b, length normalization parameter"));
  // NOTE(review): 39 is presumably the largest norm byte that decodes to an
  // exact length; confirm against the norm encoding.
  final String lengthLabel = normIndex > 39
      ? "dl, length of field (approximate)"
      : "dl, length of field";
  details.add(Explanation.match(doclen, lengthLabel));
  details.add(Explanation.match(stats.avgdl, "avgdl, average length of field"));

  final float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl);
  return Explanation.match(
      (float) (freq.getValue() / (freq.getValue() + (double) normValue)),
      "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", details);
}
@Override
@ -257,69 +293,45 @@ public class BM25Similarity extends Similarity {
/** Collection statistics for the BM25 model. */
private static class BM25Stats extends SimWeight {
/** field name, for pulling norms */
private final String field;
/** query boost */
private final float boost;
/** k1 value for scale factor */
private final float k1;
/** BM25's idf */
private final Explanation idf;
/** The average document length. */
private final float avgdl;
/** query boost */
private final float boost;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
/** weight (idf * boost) */
private final float weight;
/** field name, for pulling norms */
private final String field;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
* for LENGTH_TABLE */
private final float[] cache;
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) {
BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) {
this.field = field;
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.weight = idf.getValue() * boost;
this.k1 = k1;
this.cache = cache;
this.weight = (k1 + 1) * boost * idf.getValue();
}
}
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "parameter k1"));
if (norms == null) {
subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
} else {
byte norm;
if (norms.advanceExact(doc)) {
norm = (byte) norms.longValue();
} else {
norm = 0;
private List<Explanation> explain() {
List<Explanation> subs = new ArrayList<>();
// scale factor
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
// query boost
if (boost != 1.0f) {
subs.add(Explanation.match(boost, "boost"));
}
float doclen = lengthCache[norm & 0xff];
subs.add(Explanation.match(b, "parameter b"));
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
subs.add(Explanation.match(doclen, "fieldLength"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
// idf
subs.add(idf);
return subs;
}
}
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
Explanation boostExpl = Explanation.match(stats.boost, "boost");
List<Explanation> subs = new ArrayList<>();
if (boostExpl.getValue() != 1.0f)
subs.add(boostExpl);
subs.add(stats.idf);
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
subs.add(tfNormExpl);
return Explanation.match(
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
"score(doc="+doc+",freq="+freq+"), product of:", subs);
}
@Override
public String toString() {

View File

@ -37,7 +37,7 @@ public abstract class BasicModel {
public BasicModel() {}
/** Returns the informative content score. */
public abstract float score(BasicStats stats, float tfn);
public abstract double score(BasicStats stats, double tfn);
/**
* Returns an explanation for the score.
@ -46,9 +46,9 @@ public abstract class BasicModel {
* explanation for such models. Subclasses that use other statistics must
* override this method.</p>
*/
public Explanation explain(BasicStats stats, float tfn) {
public Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
score(stats, tfn),
(float) score(stats, tfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));

View File

@ -35,11 +35,11 @@ public class BasicModelBE extends BasicModel {
public BasicModelBE() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
double F = stats.getTotalTermFreq() + 1 + tfn;
// approximation only holds true when F << N, so we use N += F
double N = F + stats.getNumberOfDocuments();
return (float)(-log2((N - 1) * Math.E)
return (-log2((N - 1) * Math.E)
+ f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
}

View File

@ -37,16 +37,16 @@ public class BasicModelD extends BasicModel {
public BasicModelD() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
// we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
// resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
// to create a 'normalized' F.
double F = stats.getTotalTermFreq() + 1 + tfn;
double phi = (double)tfn / F;
double phi = tfn / F;
double nphi = 1 - phi;
double p = 1.0 / (stats.getNumberOfDocuments() + 1);
double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
return (float)(D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi));
return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi);
}
@Override

View File

@ -31,13 +31,13 @@ public class BasicModelG extends BasicModel {
public BasicModelG() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
double F = stats.getTotalTermFreq() + 1;
double N = stats.getNumberOfDocuments();
double lambda = F / (N + F);
// -log(1 / (lambda + 1)) -> log(lambda + 1)
return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda);
}
@Override

View File

@ -29,10 +29,10 @@ public class BasicModelIF extends BasicModel {
public BasicModelIF() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
long N = stats.getNumberOfDocuments();
long F = stats.getTotalTermFreq();
return tfn * (float)(log2(1 + (N + 1) / (F + 0.5)));
return tfn * log2(1 + (N + 1) / (F + 0.5));
}
@Override

View File

@ -30,16 +30,16 @@ public class BasicModelIn extends BasicModel {
public BasicModelIn() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
long N = stats.getNumberOfDocuments();
long n = stats.getDocFreq();
return tfn * (float)(log2((N + 1) / (n + 0.5)));
return tfn * log2((N + 1) / (n + 0.5));
}
@Override
public final Explanation explain(BasicStats stats, float tfn) {
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
score(stats, tfn),
(float) score(stats, tfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getDocFreq(), "docFreq"));

View File

@ -30,11 +30,11 @@ public class BasicModelIne extends BasicModel {
public BasicModelIne() {}
@Override
public final float score(BasicStats stats, float tfn) {
public final double score(BasicStats stats, double tfn) {
long N = stats.getNumberOfDocuments();
long F = stats.getTotalTermFreq();
double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
return tfn * (float)(log2((N + 1) / (ne + 0.5)));
return tfn * log2((N + 1) / (ne + 0.5));
}
@Override

View File

@ -35,11 +35,11 @@ public class BasicModelP extends BasicModel {
public BasicModelP() {}
@Override
public final float score(BasicStats stats, float tfn) {
float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1);
return (float)(tfn * log2(tfn / lambda)
public final double score(BasicStats stats, double tfn) {
double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1);
return tfn * log2(tfn / lambda)
+ (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
+ 0.5 * log2(2 * Math.PI * tfn));
+ 0.5 * log2(2 * Math.PI * tfn);
}
@Override

View File

@ -30,7 +30,7 @@ public class BasicStats extends Similarity.SimWeight {
/** The total number of tokens in the field. */
protected long numberOfFieldTokens;
/** The average field length. */
protected float avgFieldLength;
protected double avgFieldLength;
/** The document frequency. */
protected long docFreq;
/** The total number of occurrences of this term across all documents. */
@ -39,10 +39,10 @@ public class BasicStats extends Similarity.SimWeight {
// -------------------------- Boost-related stuff --------------------------
/** A query boost. Should be applied as a multiplicative factor to the score. */
protected final float boost;
protected final double boost;
/** Constructor. */
public BasicStats(String field, float boost) {
public BasicStats(String field, double boost) {
this.field = field;
this.boost = boost;
}
@ -76,12 +76,12 @@ public class BasicStats extends Similarity.SimWeight {
}
/** Returns the average field length. */
public float getAvgFieldLength() {
public double getAvgFieldLength() {
return avgFieldLength;
}
/** Sets the average field length. */
public void setAvgFieldLength(float avgFieldLength) {
public void setAvgFieldLength(double avgFieldLength) {
this.avgFieldLength = avgFieldLength;
}
@ -106,7 +106,7 @@ public class BasicStats extends Similarity.SimWeight {
}
/** Returns the total boost. */
public float getBoost() {
public double getBoost() {
return boost;
}
}

View File

@ -73,7 +73,7 @@ public class BooleanSimilarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
Explanation queryBoostExpl = Explanation.match(boost, "query boost");
Explanation queryBoostExpl = Explanation.match(boost, "boost");
return Explanation.match(
queryBoostExpl.getValue(),
"score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",

View File

@ -65,8 +65,8 @@ public class ClassicSimilarity extends TFIDFSimilarity {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
Explanation.match(df, "docFreq"),
Explanation.match(docCount, "docCount"));
Explanation.match(df, "docFreq, number of documents containing term"),
Explanation.match(docCount, "docCount, total number of documents with field"));
}
/** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */

View File

@ -50,16 +50,16 @@ public class DFISimilarity extends SimilarityBase {
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
protected double score(BasicStats stats, double freq, double docLen) {
final float expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);
final double expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);
// if the observed frequency is less than or equal to the expected value, then return zero.
if (freq <= expected) return 0;
final float measure = independence.score(freq, expected);
final double measure = independence.score(freq, expected);
return stats.getBoost() * (float) log2(measure + 1);
return stats.getBoost() * log2(measure + 1);
}
/**

View File

@ -107,17 +107,17 @@ public class DFRSimilarity extends SimilarityBase {
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
float tfn = normalization.tfn(stats, freq, docLen);
protected double score(BasicStats stats, double freq, double docLen) {
double tfn = normalization.tfn(stats, freq, docLen);
return stats.getBoost() *
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
}
@Override
protected void explain(List<Explanation> subs,
BasicStats stats, int doc, float freq, float docLen) {
if (stats.getBoost() != 1.0f) {
subs.add(Explanation.match(stats.getBoost(), "boost"));
BasicStats stats, int doc, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match( (float)stats.getBoost(), "boost"));
}
Explanation normExpl = normalization.explain(stats, freq, docLen);

View File

@ -34,13 +34,13 @@ public abstract class Distribution {
public Distribution() {}
/** Computes the score. */
public abstract float score(BasicStats stats, float tfn, float lambda);
public abstract double score(BasicStats stats, double tfn, double lambda);
/** Explains the score. Returns the name of the model only, since
* both {@code tfn} and {@code lambda} are explained elsewhere. */
public Explanation explain(BasicStats stats, float tfn, float lambda) {
return Explanation.match(
score(stats, tfn, lambda), getClass().getSimpleName());
public Explanation explain(BasicStats stats, double tfn, double lambda) {
return Explanation.match((float)score(stats, tfn, lambda),
getClass().getSimpleName());
}
/**

View File

@ -30,8 +30,8 @@ public class DistributionLL extends Distribution {
public DistributionLL() {}
@Override
public final float score(BasicStats stats, float tfn, float lambda) {
return (float)-Math.log(lambda / (tfn + lambda));
public final double score(BasicStats stats, double tfn, double lambda) {
return -Math.log(lambda / (tfn + lambda));
}
@Override

View File

@ -33,11 +33,11 @@ public class DistributionSPL extends Distribution {
public DistributionSPL() {}
@Override
public final float score(BasicStats stats, float tfn, float lambda) {
if (lambda == 1f) {
lambda = 0.99f;
public final double score(BasicStats stats, double tfn, double lambda) {
if (lambda == 1d) {
lambda = 0.99d;
}
return (float)-Math.log(
return -Math.log(
(Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda));
}

View File

@ -95,7 +95,7 @@ public class IBSimilarity extends SimilarityBase {
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
protected double score(BasicStats stats, double freq, double docLen) {
return stats.getBoost() *
distribution.score(
stats,
@ -105,9 +105,9 @@ public class IBSimilarity extends SimilarityBase {
@Override
protected void explain(
List<Explanation> subs, BasicStats stats, int doc, float freq, float docLen) {
if (stats.getBoost() != 1.0f) {
subs.add(Explanation.match(stats.getBoost(), "boost"));
List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float)stats.getBoost(), "boost"));
}
Explanation normExpl = normalization.explain(stats, freq, docLen);
Explanation lambdaExpl = lambda.explain(stats);

View File

@ -38,7 +38,7 @@ public abstract class Independence {
* @param freq actual term frequency
* @param expected expected term frequency
*/
public abstract float score(float freq, float expected);
public abstract double score(double freq, double expected);
// subclasses must provide a name
@Override

View File

@ -33,7 +33,7 @@ public class IndependenceChiSquared extends Independence {
public IndependenceChiSquared() {}
@Override
public float score(float freq, float expected) {
public double score(double freq, double expected) {
return (freq - expected) * (freq - expected) / expected;
}

View File

@ -32,7 +32,7 @@ public class IndependenceSaturated extends Independence {
public IndependenceSaturated() {}
@Override
public float score(float freq, float expected) {
public double score(double freq, double expected) {
return (freq - expected) / expected;
}

View File

@ -34,8 +34,8 @@ public class IndependenceStandardized extends Independence {
public IndependenceStandardized() {}
@Override
public float score(float freq, float expected) {
return (freq - expected) / (float) Math.sqrt(expected);
public double score(double freq, double expected) {
return (freq - expected) / Math.sqrt(expected);
}
@Override

View File

@ -44,11 +44,17 @@ public class LMDirichletSimilarity extends LMSimilarity {
/** Instantiates the similarity with the provided &mu; parameter. */
public LMDirichletSimilarity(CollectionModel collectionModel, float mu) {
super(collectionModel);
if (Float.isFinite(mu) == false || mu < 0) {
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
}
this.mu = mu;
}
/** Instantiates the similarity with the provided &mu; parameter. */
public LMDirichletSimilarity(float mu) {
if (Float.isFinite(mu) == false || mu < 0) {
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
}
this.mu = mu;
}
@ -63,18 +69,18 @@ public class LMDirichletSimilarity extends LMSimilarity {
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
float score = stats.getBoost() * (float)(Math.log(1 + freq /
protected double score(BasicStats stats, double freq, double docLen) {
double score = stats.getBoost() * (Math.log(1 + freq /
(mu * ((LMStats)stats).getCollectionProbability())) +
Math.log(mu / (docLen + mu)));
return score > 0.0f ? score : 0.0f;
return score > 0.0d ? score : 0.0d;
}
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
float freq, float docLen) {
if (stats.getBoost() != 1.0f) {
subs.add(Explanation.match(stats.getBoost(), "boost"));
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
}
subs.add(Explanation.match(mu, "mu"));

View File

@ -31,7 +31,9 @@ import org.apache.lucene.search.Explanation;
* <p>The model has a single parameter, &lambda;. According to said paper, the
* optimal value depends on both the collection and the query. The optimal value
* is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
*
* <p>Values should be between 0 (exclusive) and 1 (inclusive). Values near zero act
* more like a conjunction (coordinate level matching), whereas values near 1 behave
* the opposite (more like pure disjunction).</p>
* @lucene.experimental
*/
public class LMJelinekMercerSimilarity extends LMSimilarity {
@ -42,27 +44,33 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
public LMJelinekMercerSimilarity(
CollectionModel collectionModel, float lambda) {
super(collectionModel);
if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) {
throw new IllegalArgumentException("lambda must be in the range (0 .. 1]");
}
this.lambda = lambda;
}
/** Instantiates with the specified &lambda; parameter. */
public LMJelinekMercerSimilarity(float lambda) {
if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) {
throw new IllegalArgumentException("lambda must be in the range (0 .. 1]");
}
this.lambda = lambda;
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
protected double score(BasicStats stats, double freq, double docLen) {
return stats.getBoost() *
(float)Math.log(1 +
Math.log(1 +
((1 - lambda) * freq / docLen) /
(lambda * ((LMStats)stats).getCollectionProbability()));
}
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
float freq, float docLen) {
if (stats.getBoost() != 1.0f) {
subs.add(Explanation.match(stats.getBoost(), "boost"));
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
}
subs.add(Explanation.match(lambda, "lambda"));
super.explain(subs, stats, doc, freq, docLen);

View File

@ -54,7 +54,7 @@ public abstract class LMSimilarity extends SimilarityBase {
}
@Override
protected BasicStats newStats(String field, float boost) {
protected BasicStats newStats(String field, double boost) {
return new LMStats(field, boost);
}
@ -71,8 +71,8 @@ public abstract class LMSimilarity extends SimilarityBase {
@Override
protected void explain(List<Explanation> subExpls, BasicStats stats, int doc,
float freq, float docLen) {
subExpls.add(Explanation.match(collectionModel.computeProbability(stats),
double freq, double docLen) {
subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats),
"collection probability"));
}
@ -103,12 +103,12 @@ public abstract class LMSimilarity extends SimilarityBase {
/** Stores the collection distribution of the current term. */
public static class LMStats extends BasicStats {
/** The probability that the current term is generated by the collection. */
private float collectionProbability;
private double collectionProbability;
/**
* Creates LMStats for the provided field and query-time boost
*/
public LMStats(String field, float boost) {
public LMStats(String field, double boost) {
super(field, boost);
}
@ -116,7 +116,7 @@ public abstract class LMSimilarity extends SimilarityBase {
* Returns the probability that the current term is generated by the
* collection.
*/
public final float getCollectionProbability() {
public final double getCollectionProbability() {
return collectionProbability;
}
@ -124,7 +124,7 @@ public abstract class LMSimilarity extends SimilarityBase {
* Sets the probability that the current term is generated by the
* collection.
*/
public final void setCollectionProbability(float collectionProbability) {
public final void setCollectionProbability(double collectionProbability) {
this.collectionProbability = collectionProbability;
}
}
@ -135,7 +135,7 @@ public abstract class LMSimilarity extends SimilarityBase {
* Computes the probability {@code p(w|C)} according to the language model
* strategy for the current term.
*/
public float computeProbability(BasicStats stats);
public double computeProbability(BasicStats stats);
/** The name of the collection model strategy. */
public String getName();
@ -151,8 +151,8 @@ public abstract class LMSimilarity extends SimilarityBase {
public DefaultCollectionModel() {}
@Override
public float computeProbability(BasicStats stats) {
return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F);
public double computeProbability(BasicStats stats) {
return (stats.getTotalTermFreq()+1D) / (stats.getNumberOfFieldTokens()+1D);
}
@Override

View File

@ -36,7 +36,7 @@ public abstract class Normalization {
/** Returns the normalized term frequency.
* @param len the field length. */
public abstract float tfn(BasicStats stats, float tf, float len);
public abstract double tfn(BasicStats stats, double tf, double len);
/** Returns an explanation for the normalized term frequency.
* <p>The default normalization methods use the field length of the document
@ -44,13 +44,13 @@ public abstract class Normalization {
* This method provides a generic explanation for such methods.
* Subclasses that use other statistics must override this method.</p>
*/
public Explanation explain(BasicStats stats, float tf, float len) {
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
tfn(stats, tf, len),
(float) tfn(stats, tf, len),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(tf, "tf"),
Explanation.match(stats.getAvgFieldLength(), "avgFieldLength"),
Explanation.match(len, "len"));
Explanation.match((float) tf, "tf"),
Explanation.match((float) stats.getAvgFieldLength(), "avgFieldLength"),
Explanation.match((float) len, "len"));
}
/** Implementation used when there is no normalization. */
@ -60,12 +60,12 @@ public abstract class Normalization {
public NoNormalization() {}
@Override
public final float tfn(BasicStats stats, float tf, float len) {
public double tfn(BasicStats stats, double tf, double len) {
return tf;
}
@Override
public final Explanation explain(BasicStats stats, float tf, float len) {
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(1, "no normalization");
}

View File

@ -36,6 +36,10 @@ public class NormalizationH1 extends Normalization {
* normalization with respect to the document length.
*/
public NormalizationH1(float c) {
// unbounded but typical range 0..10 or so
if (Float.isFinite(c) == false || c < 0) {
throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value");
}
this.c = c;
}
@ -47,8 +51,8 @@ public class NormalizationH1 extends Normalization {
}
@Override
public final float tfn(BasicStats stats, float tf, float len) {
return tf * c * stats.getAvgFieldLength() / len;
public final double tfn(BasicStats stats, double tf, double len) {
return tf * c * (stats.getAvgFieldLength() / len);
}
@Override

View File

@ -38,6 +38,10 @@ public class NormalizationH2 extends Normalization {
* normalization with respect to the document length.
*/
public NormalizationH2(float c) {
// unbounded but typical range 0..10 or so
if (Float.isFinite(c) == false || c < 0) {
throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value");
}
this.c = c;
}
@ -49,8 +53,8 @@ public class NormalizationH2 extends Normalization {
}
@Override
public final float tfn(BasicStats stats, float tf, float len) {
return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
public final double tfn(BasicStats stats, double tf, double len) {
return tf * log2(1 + c * stats.getAvgFieldLength() / len);
}
@Override

View File

@ -36,11 +36,14 @@ public class NormalizationH3 extends Normalization {
* @param mu smoothing parameter <code>&mu;</code>
*/
public NormalizationH3(float mu) {
if (Float.isFinite(mu) == false || mu < 0) {
throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value");
}
this.mu = mu;
}
@Override
public float tfn(BasicStats stats, float tf, float len) {
public double tfn(BasicStats stats, double tf, double len) {
return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu;
}

View File

@ -34,15 +34,18 @@ public class NormalizationZ extends Normalization {
/**
* Creates NormalizationZ with the supplied parameter <code>z</code>.
* @param z represents <code>A/(A+1)</code> where <code>A</code>
* measures the specificity of the language.
* measures the specificity of the language. It ranges from (0 .. 0.5)
*/
public NormalizationZ(float z) {
if (Float.isNaN(z) || z <= 0f || z >= 0.5f) {
throw new IllegalArgumentException("illegal z value: " + z + ", must be in the range (0 .. 0.5)");
}
this.z = z;
}
@Override
public float tfn(BasicStats stats, float tf, float len) {
return (float)(tf * Math.pow(stats.avgFieldLength / len, z));
public double tfn(BasicStats stats, double tf, double len) {
return tf * Math.pow(stats.avgFieldLength / len, z);
}
@Override

View File

@ -34,7 +34,7 @@ import org.apache.lucene.util.SmallFloat;
* A subclass of {@code Similarity} that provides a simplified API for its
* descendants. Subclasses are only required to implement the {@link #score}
* and {@link #toString()} methods. Implementing
* {@link #explain(List, BasicStats, int, float, float)} is optional,
* {@link #explain(List, BasicStats, int, double, double)} is optional,
* inasmuch as SimilarityBase already provides a basic explanation of the score
* and the term frequency. However, implementers of a subclass are encouraged to
* include as much detail about the scoring method as possible.
@ -93,7 +93,7 @@ public abstract class SimilarityBase extends Similarity {
}
/** Factory method to return a custom stats object */
protected BasicStats newStats(String field, float boost) {
protected BasicStats newStats(String field, double boost) {
return new BasicStats(field, boost);
}
@ -113,7 +113,7 @@ public abstract class SimilarityBase extends Similarity {
}
final long numberOfFieldTokens;
final float avgFieldLength;
final double avgFieldLength;
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
@ -145,7 +145,7 @@ public abstract class SimilarityBase extends Similarity {
* @param docLen the document length.
* @return the score.
*/
protected abstract float score(BasicStats stats, float freq, float docLen);
protected abstract double score(BasicStats stats, double freq, double docLen);
/**
* Subclasses should implement this method to explain the score. {@code expl}
@ -161,16 +161,16 @@ public abstract class SimilarityBase extends Similarity {
* @param docLen the document length.
*/
protected void explain(
List<Explanation> subExpls, BasicStats stats, int doc, float freq, float docLen) {}
List<Explanation> subExpls, BasicStats stats, int doc, double freq, double docLen) {}
/**
* Explains the score. The implementation here provides a basic explanation
* in the format <em>score(name-of-similarity, doc=doc-id,
* freq=term-frequency), computed from:</em>, and
* attaches the score (computed via the {@link #score(BasicStats, float, float)}
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
* method) and the explanation for the term frequency. Subclasses content with
* this format may add additional details in
* {@link #explain(List, BasicStats, int, float, float)}.
* {@link #explain(List, BasicStats, int, double, double)}.
*
* @param stats the corpus level statistics.
* @param doc the document id.
@ -179,12 +179,12 @@ public abstract class SimilarityBase extends Similarity {
* @return the explanation.
*/
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, float docLen) {
BasicStats stats, int doc, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue(), docLen);
return Explanation.match(
score(stats, freq.getValue(), docLen),
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
subs);
}
@ -248,8 +248,8 @@ public abstract class SimilarityBase extends Similarity {
/** Delegates the {@link #score(int, float)} and
* {@link #explain(int, Explanation)} methods to
* {@link SimilarityBase#score(BasicStats, float, float)} and
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
* {@link SimilarityBase#score(BasicStats, double, double)} and
* {@link SimilarityBase#explain(BasicStats, int, Explanation, double)},
* respectively.
*/
final class BasicSimScorer extends SimScorer {
@ -261,9 +261,9 @@ public abstract class SimilarityBase extends Similarity {
this.norms = norms;
}
float getLengthValue(int doc) throws IOException {
double getLengthValue(int doc) throws IOException {
if (norms == null) {
return 1F;
return 1D;
}
if (norms.advanceExact(doc)) {
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
@ -275,7 +275,7 @@ public abstract class SimilarityBase extends Similarity {
@Override
public float score(int doc, float freq) throws IOException {
// We have to supply something in case norms are omitted
return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc));
}
@Override

View File

@ -450,7 +450,9 @@ public abstract class TFIDFSimilarity extends Similarity {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
return Explanation.match(idf, "idf(docFreq, docCount)",
Explanation.match(df, "docFreq, number of documents containing term"),
Explanation.match(docCount, "docCount, total number of documents with field"));
}
/**
@ -643,20 +645,37 @@ public abstract class TFIDFSimilarity extends Similarity {
"fieldNorm(doc=" + doc + ")");
return Explanation.match(
tfExplanation.getValue() * stats.idf.getValue() * fieldNormExpl.getValue(),
tfExplanation.getValue() * fieldNormExpl.getValue(),
"fieldWeight in " + doc + ", product of:",
tfExplanation, stats.idf, fieldNormExpl);
tfExplanation, fieldNormExpl);
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation queryExpl = Explanation.match(stats.boost, "boost");
Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
if (stats.boost == 1f) {
return fieldExpl;
List<Explanation> subs = new ArrayList<Explanation>();
if (stats.boost != 1F) {
subs.add(Explanation.match(stats.boost, "boost"));
}
subs.add(stats.idf);
Explanation tf = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
subs.add(tf);
float norm;
if (norms == null) {
norm = 1f;
} else if (norms.advanceExact(doc) == false) {
norm = 0f;
} else {
norm = normTable[(int) (norms.longValue() & 0xFF)];
}
Explanation fieldNorm = Explanation.match(
norm,
"fieldNorm(doc=" + doc + ")");
subs.add(fieldNorm);
return Explanation.match(
queryExpl.getValue() * fieldExpl.getValue(),
stats.queryWeight * tf.getValue() * norm,
"score(doc="+doc+",freq="+freq.getValue()+"), product of:",
queryExpl, fieldExpl);
subs);
}
}

View File

@ -97,7 +97,7 @@
* The easiest way to quickly implement a new ranking method is to extend
* {@link org.apache.lucene.search.similarities.SimilarityBase}, which provides
* basic implementations for the low level methods. Subclasses are only required to
* implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)}
* implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, double, double)}
* and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()}
* methods.
*

View File

@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Base class for tests of the axiomatic similarities. It picks the parameters
 * {@code s}, {@code queryLen} and {@code k} (boundary values or random values)
 * and hands them to {@link #getAxiomaticModel(float, int, float)}, which builds
 * the model under test.
 */
public abstract class AxiomaticTestCase extends BaseSimilarityTestCase {

  /**
   * Draws the three axiomatic parameters, in the order {@code s},
   * {@code queryLen}, {@code k}, and builds the similarity from them.
   *
   * @param random source of randomness for the parameter choices
   * @return the similarity produced by {@link #getAxiomaticModel(float, int, float)}
   */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final float s = pickAxiomaticFraction(random);
    final int queryLen = pickAxiomaticQueryLength(random);
    final float k = pickAxiomaticFraction(random);
    return getAxiomaticModel(s, queryLen, k);
  }

  /**
   * Picks a value for the axiomatic parameters {@code s} and {@code k}:
   * 0, {@link Float#MIN_VALUE}, 1, or the next float of {@code random}.
   */
  private static float pickAxiomaticFraction(Random random) {
    final int choice = random.nextInt(4);
    if (choice == 0) {
      // minimum value
      return 0;
    }
    if (choice == 1) {
      // tiny value
      return Float.MIN_VALUE;
    }
    if (choice == 2) {
      // maximum value
      return 1;
    }
    // random value
    return random.nextFloat();
  }

  /**
   * Picks the axiomatic query length: 0, 1, {@link Integer#MAX_VALUE},
   * or a random int below {@link Integer#MAX_VALUE}.
   */
  private static int pickAxiomaticQueryLength(Random random) {
    final int choice = random.nextInt(4);
    if (choice == 0) {
      // minimum value
      return 0;
    }
    if (choice == 1) {
      // tiny value
      return 1;
    }
    if (choice == 2) {
      // maximum value
      return Integer.MAX_VALUE;
    }
    // random value
    return random.nextInt(Integer.MAX_VALUE);
  }

  /**
   * Returns the axiomatic model under test, configured with the given parameters.
   *
   * @param s        axiomatic parameter s
   * @param queryLen axiomatic query length
   * @param k        axiomatic parameter k
   */
  protected abstract Similarity getAxiomaticModel(float s, int queryLen, float k);
}

View File

@ -0,0 +1,124 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Base class for tests of a {@link BasicModel}. The model under test is wrapped
 * in a {@link DFRSimilarity} together with a randomly selected after effect and
 * normalization.
 */
public abstract class BasicModelTestCase extends BaseSimilarityTestCase {

  /**
   * Builds a {@link DFRSimilarity} around {@link #getBasicModel()}.
   * The random choices are made in this order: after effect, {@code c},
   * {@code z}, {@code mu}, normalization. All three hyper-parameters are drawn
   * regardless of which normalization is finally selected.
   *
   * @param random source of randomness for all choices
   */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final AfterEffect afterEffect = pickAfterEffect(random);
    // normalization hyper-parameter c
    final float c = pickNonNegativeParameter(random);
    // normalization hyper-parameter z
    final float z = pickZ(random);
    // dirichlet parameter mu
    final float mu = pickNonNegativeParameter(random);
    final Normalization normalization = pickNormalization(random, c, z, mu);
    return new DFRSimilarity(getBasicModel(), afterEffect, normalization);
  }

  /** Selects one of the three after effects. */
  private static AfterEffect pickAfterEffect(Random random) {
    final int choice = random.nextInt(3);
    if (choice == 0) {
      return new AfterEffect.NoAfterEffect();
    }
    if (choice == 1) {
      return new AfterEffectL();
    }
    return new AfterEffectB();
  }

  /**
   * Picks a value for {@code c} or {@code mu}: 0, {@link Float#MIN_VALUE},
   * {@link Integer#MAX_VALUE}, or a random fraction of {@link Integer#MAX_VALUE}.
   */
  private static float pickNonNegativeParameter(Random random) {
    final int choice = random.nextInt(4);
    if (choice == 0) {
      // minimum value
      return 0;
    }
    if (choice == 1) {
      // tiny value
      return Float.MIN_VALUE;
    }
    if (choice == 2) {
      // maximum value
      // we just limit the test to "reasonable" values but don't enforce this anywhere.
      return Integer.MAX_VALUE;
    }
    // random value
    return Integer.MAX_VALUE * random.nextFloat();
  }

  /**
   * Picks a value for {@code z}: {@link Float#MIN_VALUE}, the largest float
   * below 0.5, or a random value that is never zero.
   */
  private static float pickZ(Random random) {
    final int choice = random.nextInt(3);
    if (choice == 0) {
      // minimum value
      return Float.MIN_VALUE;
    }
    if (choice == 1) {
      // maximum value
      return Math.nextDown(0.5f);
    }
    // random value
    final float candidate = random.nextFloat() / 2;
    // nextFloat returns 0 inclusive, we have to avoid it.
    return candidate == 0f ? Math.nextUp(candidate) : candidate;
  }

  /** Selects one of the five normalizations, configured with the given parameters. */
  private static Normalization pickNormalization(Random random, float c, float z, float mu) {
    final int choice = random.nextInt(5);
    if (choice == 0) {
      return new Normalization.NoNormalization();
    }
    if (choice == 1) {
      return new NormalizationH1(c);
    }
    if (choice == 2) {
      return new NormalizationH2(c);
    }
    if (choice == 3) {
      return new NormalizationH3(mu);
    }
    return new NormalizationZ(z);
  }

  /** return BasicModel under test */
  protected abstract BasicModel getBasicModel();
}

View File

@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Base class for tests of a {@link Distribution}. The distribution under test is
 * wrapped in an {@link IBSimilarity} together with a randomly selected lambda
 * and normalization.
 */
public abstract class DistributionTestCase extends BaseSimilarityTestCase {

  /**
   * Builds an {@link IBSimilarity} around {@link #getDistribution()}.
   * The random choices are made in this order: lambda, {@code c}, {@code z},
   * {@code mu}, normalization. All three hyper-parameters are drawn regardless
   * of which normalization is finally selected.
   *
   * @param random source of randomness for all choices
   */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final Lambda lambda = random.nextBoolean() ? new LambdaDF() : new LambdaTTF();
    // normalization hyper-parameter c
    final float c = chooseNonNegativeParameter(random);
    // normalization hyper-parameter z
    final float z = chooseZ(random);
    // dirichlet parameter mu
    final float mu = chooseNonNegativeParameter(random);
    final Normalization normalization = chooseNormalization(random, c, z, mu);
    return new IBSimilarity(getDistribution(), lambda, normalization);
  }

  /**
   * Chooses a value for {@code c} or {@code mu}: 0, {@link Float#MIN_VALUE},
   * {@link Integer#MAX_VALUE}, or a random fraction of {@link Integer#MAX_VALUE}.
   */
  private static float chooseNonNegativeParameter(Random random) {
    switch (random.nextInt(4)) {
      case 0:
        // minimum value
        return 0;
      case 1:
        // tiny value
        return Float.MIN_VALUE;
      case 2:
        // maximum value
        // we just limit the test to "reasonable" values but don't enforce this anywhere.
        return Integer.MAX_VALUE;
      default:
        // random value
        return Integer.MAX_VALUE * random.nextFloat();
    }
  }

  /**
   * Chooses a value for {@code z}: {@link Float#MIN_VALUE}, the largest float
   * below 0.5, or a random value that is never zero.
   */
  private static float chooseZ(Random random) {
    switch (random.nextInt(3)) {
      case 0:
        // minimum value
        return Float.MIN_VALUE;
      case 1:
        // maximum value
        return Math.nextDown(0.5f);
      default:
        // random value
        final float zcand = random.nextFloat() / 2;
        // nextFloat returns 0 inclusive, we have to avoid it.
        return zcand == 0f ? Math.nextUp(zcand) : zcand;
    }
  }

  /** Chooses one of the five normalizations, configured with the given parameters. */
  private static Normalization chooseNormalization(Random random, float c, float z, float mu) {
    switch (random.nextInt(5)) {
      case 0:
        return new Normalization.NoNormalization();
      case 1:
        return new NormalizationH1(c);
      case 2:
        return new NormalizationH2(c);
      case 3:
        return new NormalizationH3(mu);
      default:
        return new NormalizationZ(z);
    }
  }

  /** return Distribution under test */
  protected abstract Distribution getDistribution();
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Sanity tests for {@code AxiomaticF1EXP}.
 * <p>
 * Currently disabled: the similarity returns NaN scores for sloppy freqs &lt; 1
 * (due to log without floor).
 */
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF1EXP extends AxiomaticTestCase {

  /** Builds the model from {@code s} and {@code k}; {@code queryLen} is not passed on. */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    final Similarity model = new AxiomaticF1EXP(s, k);
    return model;
  }
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link AxiomaticTestCase} to the {@link AxiomaticF1LOG} model.
 */
// returns NaN scores for sloppy freqs < 1 (due to log without floor)
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF1LOG extends AxiomaticTestCase {

  /** Builds the model from {@code s} only; {@code queryLen} and {@code k} are not passed on. */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    final Similarity model = new AxiomaticF1LOG(s);
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link AxiomaticTestCase} to the {@link AxiomaticF2EXP} model.
 */
public class TestAxiomaticF2EXP extends AxiomaticTestCase {

  /** Builds the model from {@code s} and {@code k}; {@code queryLen} is not passed on. */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    final Similarity model = new AxiomaticF2EXP(s, k);
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link AxiomaticTestCase} to the {@link AxiomaticF2LOG} model.
 */
public class TestAxiomaticF2LOG extends AxiomaticTestCase {

  /** Builds the model from {@code s} only; {@code queryLen} and {@code k} are not passed on. */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    final Similarity model = new AxiomaticF2LOG(s);
    return model;
  }
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link AxiomaticTestCase} to the {@link AxiomaticF3EXP} model.
 */
// returns negative scores at least, but it (now) warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF3EXP extends AxiomaticTestCase {

  /**
   * Builds the model with fixed constructor arguments; {@code s}, {@code queryLen}
   * and {@code k} are all ignored for now.
   */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    // TODO: use the randomized parameters and not these hardcoded ones
    final Similarity model = new AxiomaticF3EXP(0.25f, 1);
    return model;
  }
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link AxiomaticTestCase} to the {@link AxiomaticF3LOG} model.
 */
// returns negative scores at least, but it (now) warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF3LOG extends AxiomaticTestCase {

  /**
   * Builds the model with fixed constructor arguments; {@code s}, {@code queryLen}
   * and {@code k} are all ignored for now.
   */
  @Override
  protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
    // TODO: use the randomized parameters and not these hardcoded ones
    final Similarity model = new AxiomaticF3LOG(0.25f, 1);
    return model;
  }
}

View File

@ -17,10 +17,9 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.util.LuceneTestCase;
import java.util.Random;
public class TestBM25Similarity extends LuceneTestCase {
public class TestBM25Similarity extends BaseSimilarityTestCase {
public void testIllegalK1() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
@ -61,17 +60,51 @@ public class TestBM25Similarity extends LuceneTestCase {
assertTrue(expected.getMessage().contains("illegal b value"));
}
private static Explanation findExplanation(Explanation expl, String text) {
if (expl.getDescription().equals(text)) {
return expl;
} else {
for (Explanation sub : expl.getDetails()) {
Explanation match = findExplanation(sub, text);
if (match != null) {
return match;
}
}
/**
 * Builds the {@link BM25Similarity} under test with randomized parameters.
 * {@code k1} and {@code b} are each chosen by their own {@code random.nextInt(4)}
 * draw between a minimum, a tiny ({@link Float#MIN_VALUE}), a maximum and a
 * random value.
 *
 * @param random source of randomness for picking the parameters
 * @return a new {@code BM25Similarity(k1, b)}
 */
@Override
protected Similarity getSimilarity(Random random) {
// term frequency normalization parameter k1
final float k1;
switch (random.nextInt(4)) {
case 0:
// minimum value
k1 = 0;
break;
case 1:
// tiny value
k1 = Float.MIN_VALUE;
break;
case 2:
// maximum value
// upper bounds on individual term's score is 43.262806 * (k1 + 1) * boost
// we just limit the test to "reasonable" k1 values but don't enforce this anywhere.
k1 = Integer.MAX_VALUE;
break;
default:
// random value
k1 = Integer.MAX_VALUE * random.nextFloat();
break;
}
// NOTE(review): the next line looks like a leftover of the removed findExplanation
// helper in this diff rendering rather than part of getSimilarity - confirm.
return null;
// length normalization parameter b [0 .. 1]
final float b;
switch (random.nextInt(4)) {
case 0:
// minimum value
b = 0;
break;
case 1:
// tiny value
b = Float.MIN_VALUE;
break;
case 2:
// maximum value
b = 1;
break;
default:
// random value
b = random.nextFloat();
break;
}
return new BM25Similarity(k1, b);
}
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelBE}.
 */
// returns negative scores at least, but it warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelBE extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelBE} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelBE();
    return model;
  }
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelD}.
 */
// scores go backwards with respect to TF, but it warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelD extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelD} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelD();
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelG}.
 */
public class TestBasicModelG extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelG} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelG();
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelIF}.
 */
public class TestBasicModelIF extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelIF} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelIF();
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelIn}.
 */
public class TestBasicModelIn extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelIn} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelIn();
    return model;
  }
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelIne}.
 */
public class TestBasicModelIne extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelIne} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelIne();
    return model;
  }
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link BasicModelTestCase} to {@link BasicModelP}.
 */
// scores go backwards with respect to TF, but it warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelP extends BasicModelTestCase {

  /** Supplies a fresh {@link BasicModelP} on every call. */
  @Override
  protected BasicModel getBasicModel() {
    final BasicModel model = new BasicModelP();
    return model;
  }
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
@ -32,11 +33,10 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestBooleanSimilarity extends LuceneTestCase {
public class TestBooleanSimilarity extends BaseSimilarityTestCase {
public void testTermScoreIsEqualToBoost() throws IOException {
Directory dir = newDirectory();
@ -114,4 +114,9 @@ public class TestBooleanSimilarity extends LuceneTestCase {
0f);
}
}
/**
 * Supplies the similarity under test. {@code random} is not consulted: the
 * similarity is built with its no-arg constructor.
 */
@Override
protected Similarity getSimilarity(Random random) {
  final Similarity similarity = new BooleanSimilarity();
  return similarity;
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
@ -39,11 +40,10 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestClassicSimilarity extends LuceneTestCase {
public class TestClassicSimilarity extends BaseSimilarityTestCase {
private Directory directory;
private IndexReader indexReader;
private IndexSearcher indexSearcher;
@ -185,4 +185,9 @@ public class TestClassicSimilarity extends LuceneTestCase {
0f);
}
}
/**
 * Supplies the similarity under test. {@code random} is not consulted: the
 * similarity is built with its no-arg constructor.
 */
@Override
protected Similarity getSimilarity(Random random) {
  final Similarity similarity = new ClassicSimilarity();
  return similarity;
}
}

View File

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
/**
 * Binds {@link DistributionTestCase} to {@link DistributionLL}.
 */
public class TestDistributionLL extends DistributionTestCase {

  /** Supplies a fresh {@link DistributionLL} on every call. */
  @Override
  protected Distribution getDistribution() {
    final Distribution distribution = new DistributionLL();
    return distribution;
  }
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
 * Binds {@link DistributionTestCase} to {@link DistributionSPL}.
 */
// scores go infinite, but it warns it has problems
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestDistributionSPL extends DistributionTestCase {

  /** Supplies a fresh {@link DistributionSPL} on every call. */
  @Override
  protected Distribution getDistribution() {
    final Distribution distribution = new DistributionSPL();
    return distribution;
  }
}

View File

@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Binds {@link BaseSimilarityTestCase} to a {@link DFISimilarity} that uses
 * {@link IndependenceChiSquared}.
 */
public class TestIndependenceChiSquared extends BaseSimilarityTestCase {

  /** Supplies the similarity under test; {@code random} is not consulted. */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final Similarity similarity = new DFISimilarity(new IndependenceChiSquared());
    return similarity;
  }
}

View File

@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Binds {@link BaseSimilarityTestCase} to a {@link DFISimilarity} that uses
 * {@link IndependenceSaturated}.
 */
public class TestIndependenceSaturated extends BaseSimilarityTestCase {

  /** Supplies the similarity under test; {@code random} is not consulted. */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final Similarity similarity = new DFISimilarity(new IndependenceSaturated());
    return similarity;
  }
}

View File

@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Binds {@link BaseSimilarityTestCase} to a {@link DFISimilarity} that uses
 * {@link IndependenceStandardized}.
 */
public class TestIndependenceStandardized extends BaseSimilarityTestCase {

  /** Supplies the similarity under test; {@code random} is not consulted. */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final Similarity similarity = new DFISimilarity(new IndependenceStandardized());
    return similarity;
  }
}

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Binds {@link BaseSimilarityTestCase} to {@link LMDirichletSimilarity} instances
 * built with a randomized {@code mu}.
 */
public class TestLMDirichletSimilarity extends BaseSimilarityTestCase {

  /**
   * Creates the similarity under test. The smoothing parameter mu is unbounded, so a
   * single {@code random.nextInt(4)} draw selects a minimum, tiny, maximum or random value.
   */
  @Override
  protected final Similarity getSimilarity(Random random) {
    final int scenario = random.nextInt(4);
    final float mu;
    if (scenario == 0) {
      // minimum value
      mu = 0;
    } else if (scenario == 1) {
      // tiny value
      mu = Float.MIN_VALUE;
    } else if (scenario == 2) {
      // maximum value
      // we just limit the test to "reasonable" mu values but don't enforce this anywhere.
      mu = Integer.MAX_VALUE;
    } else {
      // random value
      mu = Integer.MAX_VALUE * random.nextFloat();
    }
    return new LMDirichletSimilarity(mu);
  }
}

View File

@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.Random;
/**
 * Binds {@link BaseSimilarityTestCase} to {@link LMJelinekMercerSimilarity} instances
 * built with a randomized {@code lambda}.
 */
public class TestLMJelinekMercerSimilarity extends BaseSimilarityTestCase {

  /**
   * Creates the similarity under test. A single {@code random.nextInt(3)} draw selects
   * a tiny, maximum or random lambda; every choice stays inside (0..1].
   */
  @Override
  protected final Similarity getSimilarity(Random random) {
    // smoothing parameter lambda: (0..1]
    final float lambda;
    switch (random.nextInt(3)) {
      case 0:
        // tiny value
        lambda = Float.MIN_VALUE;
        break;
      case 1:
        // maximum value
        lambda = 1;
        break;
      default:
        // random value: nextFloat() is drawn from [0, 1) and can be exactly 0, which is
        // outside the (0..1] range, so that one draw is raised to the smallest positive float
        lambda = Math.max(Float.MIN_VALUE, random.nextFloat());
        break;
    }
    return new LMJelinekMercerSimilarity(lambda);
  }
}

View File

@ -51,7 +51,7 @@ import org.apache.lucene.util.Version;
* items in the list. If a test case fails, the name of the Similarity that
* caused the failure is returned as part of the assertion error message.</p>
* <p>Unit testing is performed by constructing statistics manually and calling
* the {@link SimilarityBase#score(BasicStats, float, float)} method of the
* the {@link SimilarityBase#score(BasicStats, double, double)} method of the
* Similarities. The statistics represent corner cases of corpus distributions.
* </p>
* <p>For the integration tests, a small (8-document) collection is indexed. The
@ -191,17 +191,17 @@ public class TestSimilarityBase extends LuceneTestCase {
}
/**
* The generic test core called by all unit test methods. It calls the
* {@link SimilarityBase#score(BasicStats, float, float)} method of all
* {@link SimilarityBase#score(BasicStats, double, double)} method of all
* Similarities in {@link #sims} and checks if the score is valid; i.e. it
* is a finite positive real number.
*/
private void unitTestCore(BasicStats stats, float freq, int docLen) {
for (SimilarityBase sim : sims) {
BasicStats realStats = (BasicStats) sim.computeWeight(
stats.getBoost(),
(float)stats.getBoost(),
toCollectionStats(stats),
toTermStats(stats));
float score = sim.score(realStats, freq, docLen);
float score = (float)sim.score(realStats, freq, docLen);
float explScore = sim.explain(
realStats, 1, Explanation.match(freq, "freq"), docLen).getValue();
assertFalse("Score infinite: " + sim.toString(), Float.isInfinite(score));
@ -524,17 +524,17 @@ public class TestSimilarityBase extends LuceneTestCase {
/**
* The generic test core called by all correctness test methods. It calls the
* {@link SimilarityBase#score(BasicStats, float, float)} method of all
* {@link SimilarityBase#score(BasicStats, double, double)} method of all
* Similarities in {@link #sims} and compares the score against the manually
* computed {@code gold}.
*/
private void correctnessTestCore(SimilarityBase sim, float gold) {
BasicStats stats = createStats();
BasicStats realStats = (BasicStats) sim.computeWeight(
stats.getBoost(),
(float)stats.getBoost(),
toCollectionStats(stats),
toTermStats(stats));
float score = sim.score(realStats, FREQ, DOC_LEN);
float score = (float) sim.score(realStats, FREQ, DOC_LEN);
assertEquals(
sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON);
}

View File

@ -1484,7 +1484,7 @@ public class TestBlockJoin extends LuceneTestCase {
}
@Override
protected float score(BasicStats stats, float freq, float docLen) {
protected double score(BasicStats stats, double freq, double docLen) {
return freq;
}
};

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.Random;
import junit.framework.Assert;
@ -318,6 +319,8 @@ public class CheckHits {
public static float explainToleranceDelta(float f1, float f2) {
  // relative part: scale the tolerance by the larger of the two magnitudes
  final float largestMagnitude = Math.max(Math.abs(f1), Math.abs(f2));
  final float relativeTolerance = largestMagnitude * EXPLAIN_SCORE_TOLERANCE_DELTA;
  // never go below the absolute minimum tolerance
  return Math.max(EXPLAIN_SCORE_TOLERANCE_MINIMUM, relativeTolerance);
}
private static final Pattern COMPUTED_FROM_PATTERN = Pattern.compile(".*, computed as .* from:");
/**
* Assert that an explanation has the expected score, and optionally that its
@ -335,9 +338,12 @@ public class CheckHits {
boolean deep,
Explanation expl) {
float value = expl.getValue();
Assert.assertEquals(q+": score(doc="+doc+")="+score+
" != explanationScore="+value+" Explanation: "+expl,
score,value,explainToleranceDelta(score, value));
// TODO: clean this up if we use junit 5 (the assert message is costly)
// The detailed message is only built when the comparison fails. A failed
// assertEquals throws an AssertionError, which is an Error and not an Exception,
// so AssertionError is what has to be caught for the message to be reported.
try {
  Assert.assertEquals(score, value, explainToleranceDelta(score, value));
} catch (AssertionError e) {
  Assert.fail(q+": score(doc="+doc+")="+score+" != explanationScore="+value+" Explanation: "+expl);
}
if (!deep) return;
@ -368,7 +374,7 @@ public class CheckHits {
boolean productOf = descr.endsWith("product of:");
boolean sumOf = descr.endsWith("sum of:");
boolean maxOf = descr.endsWith("max of:");
boolean computedOf = descr.matches(".*, computed as .* from:");
boolean computedOf = descr.indexOf("computed as") > 0 && COMPUTED_FROM_PATTERN.matcher(descr).matches();
boolean maxTimesOthers = false;
if (!(productOf || sumOf || maxOf || computedOf)) {
// maybe 'max plus x times others'
@ -386,11 +392,12 @@ public class CheckHits {
}
}
// TODO: this is a TERRIBLE assertion!!!!
Assert.assertTrue(
q+": multi valued explanation description=\""+descr
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
+" or 'sum of:' or 'max of:' - "+expl,
productOf || sumOf || maxOf || computedOf || maxTimesOthers);
if (false == (productOf || sumOf || maxOf || computedOf || maxTimesOthers)) {
Assert.fail(
q+": multi valued explanation description=\""+descr
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
+" or 'sum of:' or 'max of:' - "+expl);
}
float sum = 0;
float product = 1;
float max = 0;
@ -414,9 +421,13 @@ public class CheckHits {
Assert.assertTrue("should never get here!", computedOf);
combined = value;
}
Assert.assertEquals(q+": actual subDetails combined=="+combined+
" != value="+value+" Explanation: "+expl,
combined,value,explainToleranceDelta(combined, value));
// TODO: clean this up if we use junit 5 (the assert message is costly)
// The detailed message is only built when the comparison fails. A failed
// assertEquals throws an AssertionError, which is an Error and not an Exception,
// so AssertionError is what has to be caught for the message to be reported.
try {
  Assert.assertEquals(combined, value, explainToleranceDelta(combined, value));
} catch (AssertionError e) {
  Assert.fail(q+": actual subDetails combined=="+combined+
      " != value="+value+" Explanation: "+expl);
}
}
}
}

View File

@ -0,0 +1,473 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity.SimWeight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
* Abstract class to do basic tests for a similarity.
* NOTE: This test focuses on the similarity impl, nothing else.
* The [stretch] goal is for this test to be
* so thorough in testing a new Similarity that if this
* test passes, then all Lucene/Solr tests should also pass. I.e.,
* if there is some bug in a given Similarity that this
* test fails to catch then this test needs to be improved! */
public abstract class BaseSimilarityTestCase extends LuceneTestCase {
static LeafReader WITHOUT_NORM;
static Directory WITHOUT_NORM_DIR;
static LeafReader WITH_NORM_BASE;
static Directory WITH_NORM_DIR;
static List<LeafReader> NORM_VALUES;
/**
 * Builds the shared single-document fixtures used by every test case.
 * <p>
 * {@code NORM_VALUES.get(0)} is a reader whose field omits norms; {@code NORM_VALUES.get(n)}
 * for {@code 1 <= n <= 255} is a view over {@code WITH_NORM_BASE} that reports {@code n} as
 * the norm of document 0 for the field {@code "field"}.
 */
@BeforeClass
public static void beforeClass() throws Exception {
  // without norms: the field must really omit norms, because index 0 of NORM_VALUES
  // stands for the "norms omitted" case
  WITHOUT_NORM_DIR = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), WITHOUT_NORM_DIR);
  Document doc = new Document();
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setOmitNorms(true);
  doc.add(newField("field", "value", fieldType));
  writer.addDocument(doc);
  WITHOUT_NORM = getOnlyLeafReader(writer.getReader());
  writer.close();

  // with norms: a regular text field; the stored norm is replaced by a canned value below
  WITH_NORM_DIR = newDirectory();
  writer = new RandomIndexWriter(random(), WITH_NORM_DIR);
  doc = new Document();
  doc.add(newTextField("field", "value", Field.Store.NO));
  writer.addDocument(doc);
  WITH_NORM_BASE = getOnlyLeafReader(writer.getReader());
  writer.close();

  // all possible norm values for the doc
  NORM_VALUES = new ArrayList<>();
  NORM_VALUES.add(WITHOUT_NORM);
  for (int i = 1; i < 256; i++) {
    final long value = i;
    NORM_VALUES.add(new FilterLeafReader(WITH_NORM_BASE) {
      @Override
      public CacheHelper getCoreCacheHelper() {
        return null;
      }

      @Override
      public CacheHelper getReaderCacheHelper() {
        return null;
      }

      @Override
      public NumericDocValues getNormValues(String field) throws IOException {
        if (field.equals("field")) {
          // pretend the single document was indexed with this norm
          return new CannedNorm(value);
        } else {
          return super.getNormValues(field);
        }
      }
    });
  }
}
/** Closes the shared readers and directories, then drops every static reference to them. */
@AfterClass
public static void afterClass() throws Exception {
  IOUtils.close(WITH_NORM_BASE, WITH_NORM_DIR, WITHOUT_NORM, WITHOUT_NORM_DIR);
  // readers
  WITHOUT_NORM = null;
  WITH_NORM_BASE = null;
  // directories
  WITHOUT_NORM_DIR = null;
  WITH_NORM_DIR = null;
  // per-norm reader views
  NORM_VALUES = null;
}
/** Norms for a one-document index: document 0 always reports the value given at construction. */
static class CannedNorm extends NumericDocValues {
  final long value;
  int docID = -1;

  CannedNorm(long value) {
    this.value = value;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int nextDoc() throws IOException {
    // the first call lands on document 0, every later call is exhausted
    docID = (docID == -1) ? 0 : NO_MORE_DOCS;
    return docID;
  }

  @Override
  public int advance(int target) throws IOException {
    // document 0 is the only one that exists
    docID = (target == 0) ? 0 : NO_MORE_DOCS;
    return docID;
  }

  @Override
  public boolean advanceExact(int target) throws IOException {
    assert target == 0;
    docID = target;
    return true;
  }

  @Override
  public long longValue() throws IOException {
    return value;
  }

  @Override
  public long cost() {
    return 0;
  }
}
/**
 * Return a new similarity with all parameters randomized within valid ranges.
 * <p>
 * {@link #testRandomScoring()} calls this once per outer iteration, so each iteration
 * exercises a freshly parameterized instance.
 *
 * @param random source of randomness to draw the parameters from
 * @return the similarity to test
 */
protected abstract Similarity getSimilarity(Random random);
// upper bound used by newCorpus when it generates the maxDoc of a very large collection
static final long MAXDOC_FORTESTING = 1L << 48;
// cap used by newCorpus for the collection-wide token statistics;
// must be at least MAXDOC_FORTESTING + Integer.MAX_VALUE
static final long MAXTOKENS_FORTESTING = 1L << 49;
/**
 * Generates random collection statistics for the field {@code "field"} that are at least
 * possible given that a single document produced the supplied norm.
 *
 * @param random source of randomness
 * @param norm norm of the document under test; 0 is treated as "norms omitted"
 * @return the generated statistics
 */
static CollectionStatistics newCorpus(Random random, int norm) {
  // fewest tokens the collection can contain (this norm was produced somehow):
  // with omitted norms there was still at least one token, otherwise it is the
  // minimum value that would decode to such a norm
  final int minTokens = (norm == 0) ? 1 : SmallFloat.byte4ToInt((byte) norm);

  // either a small collection or a very large one
  final long maxDocCeiling = random.nextBoolean() ? 100000 : MAXDOC_FORTESTING;
  final long maxDoc = TestUtil.nextLong(random, 1, maxDocCeiling);

  // TODO: make this a mandatory statistic, or test it with -1
  // either a sparse field or a fully populated one
  final long docCount = random.nextBoolean() ? TestUtil.nextLong(random, 1, maxDoc) : maxDoc;

  // random docsize: but can't require docs to have > 2B tokens
  long maxTokens;
  try {
    maxTokens = Math.min(MAXTOKENS_FORTESTING, Math.multiplyExact(docCount, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    maxTokens = MAXTOKENS_FORTESTING;
  }

  // TODO: make this a mandatory statistic, or test it with -1
  // either the shortest possible docs or a random docsize
  final long sumDocFreq = random.nextBoolean()
      ? docCount
      : TestUtil.nextLong(random, docCount, maxTokens + 1 - minTokens);

  // total term frequency when no term repeats (except to satisfy this norm)
  final long minTotalTermFreq = sumDocFreq - 1 + minTokens;

  final long sumTotalTermFreq;
  final int mode = random.nextInt(3);
  if (mode == 0) {
    // unsupported (e.g. omitTF)
    sumTotalTermFreq = -1;
  } else if (mode == 1) {
    // no repetition of terms
    sumTotalTermFreq = minTotalTermFreq;
  } else {
    // random repetition
    assert minTotalTermFreq <= maxTokens;
    sumTotalTermFreq = TestUtil.nextLong(random, minTotalTermFreq, maxTokens);
  }

  return new CollectionStatistics("field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
}
private static final BytesRef TERM = new BytesRef("term");

/**
 * Generates random statistics for a term that fits within the bounds of the given corpus.
 *
 * @param random source of randomness
 * @param corpus collection the term belongs to
 * @return the generated term statistics
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  // either a rare term (a single document) or random specificity
  final long docFreq = random.nextBoolean() ? 1 : TestUtil.nextLong(random, 1, corpus.docCount());

  final long corpusTokens = corpus.sumTotalTermFreq();
  if (corpusTokens == -1) {
    // omitTF
    return new TermStatistics(TERM, docFreq, -1);
  }
  if (random.nextBoolean()) {
    // no repetition
    return new TermStatistics(TERM, docFreq, docFreq);
  }

  // random repetition: but can't require docs to have > 2B tokens
  long maxTermFreq;
  try {
    maxTermFreq = Math.min(corpusTokens, Math.multiplyExact(docFreq, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    maxTermFreq = corpusTokens;
  }
  return new TermStatistics(TERM, docFreq, TestUtil.nextLong(random, docFreq, maxTermFreq));
}
/**
 * Scores a bunch of random terms, corpora and frequencies for every possible document length,
 * delegating the actual checks for each combination to {@code doTestScoring}:
 * <ul>
 *   <li>scores are non-negative and finite.
 *   <li>score matches the explanation exactly.
 *   <li>internal explanations calculations are sane (e.g. sum of: and so on actually compute sums)
 *   <li>scores don't decrease as term frequencies increase: e.g. score(freq=N + 1) &gt;= score(freq=N)
 *   <li>scores don't decrease as documents get shorter, e.g. score(len=M) &gt;= score(len=M+1)
 *   <li>scores don't decrease as terms get rarer, e.g. score(term=N) &gt;= score(term=N+1)
 *   <li>scoring works for floating point frequencies (e.g. sloppy phrase and span queries will work)
 *   <li>scoring works for reasonably large 64-bit statistic values (e.g. distributed search will work)
 *   <li>scoring works for reasonably large boost values (0 .. Integer.MAX_VALUE, e.g. query boosts will work)
 *   <li>scoring works for parameters randomized within valid ranges (see {@link #getSimilarity(Random)})
 * </ul>
 */
public void testRandomScoring() throws Exception {
  final Random random = random();
  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    // pull a new similarity to switch up parameters
    final Similarity similarity = getSimilarity(random);
    for (int pass = 0; pass < 10; pass++) {
      // for each norm value...
      for (int norm = 0; norm < NORM_VALUES.size(); norm++) {
        final CollectionStatistics corpus = newCorpus(random, norm);
        for (int attempt = 0; attempt < 10; attempt++) {
          final TermStatistics term = newTerm(random, corpus);
          final float freq = randomFreq(random, term);
          final float boost = randomBoost(random);
          doTestScoring(similarity, corpus, term, boost, freq, norm);
        }
      }
    }
  }
}

/** Picks a within-document frequency that is possible for the given term statistics. */
private static float randomFreq(Random random, TermStatistics term) {
  if (term.totalTermFreq() == -1) {
    // omit TF
    return 1;
  }
  if (term.docFreq() == 1) {
    // only one document, all the instances must be here.
    return Math.toIntExact(term.totalTermFreq());
  }
  // there is at least one other document, and those must have at least 1 instance each.
  final int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
  if (random.nextBoolean()) {
    // integer frequency
    return TestUtil.nextInt(random, 1, upperBound);
  }
  // fractional frequency. It must be at least the 2nd float value: the pairwise test will
  // check MIN_VALUE in this case. This avoids testing frequencies of 0, which seem wrong to
  // allow (we should enforce computeSlopFactor etc)
  final float candidate = upperBound * random.nextFloat();
  return candidate <= Float.MIN_VALUE ? Math.nextUp(Float.MIN_VALUE) : candidate;
}

/**
 * Picks a boost. The test is limited to "reasonable" boost values, although that is not
 * enforced anywhere: too big, and you are asking for overflow, which is hard for a sim to
 * enforce (but definitely possible). For now, we just want to detect overflow where it's a
 * real bug/hazard in the computation with reasonable inputs.
 */
private static float randomBoost(Random random) {
  switch (random.nextInt(5)) {
    case 0:
      // minimum value (not enforced)
      return 0F;
    case 1:
      // tiny value
      return Float.MIN_VALUE;
    case 2:
      // no-op value (sometimes treated special in explanations)
      return 1F;
    case 3:
      // maximum value (not enforced)
      return Integer.MAX_VALUE;
    default:
      // random value
      return random.nextFloat() * Integer.MAX_VALUE;
  }
}
/**
 * Runs the checks for a single test case, so that if you hit a test failure you can write a
 * reproducer just for that scenario.
 * <p>
 * Checks the score and its explanation for the given inputs, then compares against the score
 * for a slightly smaller frequency, for the previous norm value and for a slightly rarer term.
 * When any check fails, the inputs of the scenario are printed to standard output.
 *
 * @param similarity similarity under test
 * @param corpus collection statistics to score against
 * @param term statistics of the term being scored
 * @param boost boost passed to {@code computeWeight}
 * @param freq within-document frequency to score
 * @param norm index into {@code NORM_VALUES}; index 0 is the {@code WITHOUT_NORM} reader
 * @throws IOException if the similarity fails to create or use a scorer
 */
private static void doTestScoring(Similarity similarity, CollectionStatistics corpus, TermStatistics term, float boost, float freq, int norm) throws IOException {
  boolean success = false;
  SimWeight weight = similarity.computeWeight(boost, corpus, term);
  SimScorer scorer = similarity.simScorer(weight, NORM_VALUES.get(norm).getContext());
  try {
    float score = scorer.score(0, freq);
    // check that score isn't infinite or negative
    assertTrue("infinite/NaN score: " + score, Float.isFinite(score));
    assertTrue("negative score: " + score, score >= 0);
    // check explanation matches
    Explanation explanation = scorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
    if (score != explanation.getValue()) {
      fail("expected: " + score + ", got: " + explanation);
    }
    CheckHits.verifyExplanation("<test query>", 0, score, true, explanation);

    // check score(freq-1), given the same norm it should be <= score(freq) [scores non-decreasing for more term occurrences]
    final float prevFreq;
    if (random().nextBoolean() && freq == (int)freq && freq > 1 && term.docFreq() > 1) {
      // previous in integer space
      prevFreq = freq - 1;
    } else {
      // previous in float space (e.g. for sloppyPhrase)
      prevFreq = Math.nextDown(freq);
    }
    float prevScore = scorer.score(0, prevFreq);
    // check that score isn't infinite or negative
    assertTrue(Float.isFinite(prevScore));
    assertTrue(prevScore >= 0);
    // check explanation matches
    Explanation prevExplanation = scorer.explain(0, Explanation.match(prevFreq, "freq, occurrences of term within document"));
    if (prevScore != prevExplanation.getValue()) {
      fail("expected: " + prevScore + ", got: " + prevExplanation);
    }
    CheckHits.verifyExplanation("test query (prevFreq)", 0, prevScore, true, prevExplanation);
    if (prevScore > score) {
      System.out.println(prevExplanation);
      System.out.println(explanation);
      fail("score(" + prevFreq + ")=" + prevScore + " > score(" + freq + ")=" + score);
    }

    // check score(norm-1), given the same freq it should be >= score(norm) [scores non-decreasing as docs get shorter]
    // (norm 1 is not compared against index 0, which is the reader without norms)
    if (norm > 1) {
      SimScorer prevNormScorer = similarity.simScorer(weight, NORM_VALUES.get(norm - 1).getContext());
      float prevNormScore = prevNormScorer.score(0, freq);
      // check that score isn't infinite or negative
      assertTrue(Float.isFinite(prevNormScore));
      assertTrue(prevNormScore >= 0);
      // check explanation matches
      Explanation prevNormExplanation = prevNormScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
      if (prevNormScore != prevNormExplanation.getValue()) {
        fail("expected: " + prevNormScore + ", got: " + prevNormExplanation);
      }
      CheckHits.verifyExplanation("test query (prevNorm)", 0, prevNormScore, true, prevNormExplanation);
      if (prevNormScore < score) {
        System.out.println(prevNormExplanation);
        System.out.println(explanation);
        fail("score(" + freq + "," + (norm-1) + ")=" + prevNormScore + " < score(" + freq + "," + norm + ")=" + score);
      }
    }

    // check score(term-1), given the same freq/norm it should be >= score(term) [scores non-decreasing as terms get rarer]
    if (term.docFreq() > 1 && (term.totalTermFreq() == -1 || freq < term.totalTermFreq())) {
      final long prevTotalTermFreq;
      if (term.totalTermFreq() == -1) {
        prevTotalTermFreq = -1;
      } else {
        prevTotalTermFreq = term.totalTermFreq() - 1;
      }
      TermStatistics prevTerm = new TermStatistics(term.term(), term.docFreq() - 1, prevTotalTermFreq);
      // the weight must be built from the rarer term, otherwise this compares a score with itself
      SimWeight prevWeight = similarity.computeWeight(boost, corpus, prevTerm);
      SimScorer prevTermScorer = similarity.simScorer(prevWeight, NORM_VALUES.get(norm).getContext());
      float prevTermScore = prevTermScorer.score(0, freq);
      // check that score isn't infinite or negative
      assertTrue(Float.isFinite(prevTermScore));
      assertTrue(prevTermScore >= 0);
      // check explanation matches
      Explanation prevTermExplanation = prevTermScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document"));
      if (prevTermScore != prevTermExplanation.getValue()) {
        fail("expected: " + prevTermScore + ", got: " + prevTermExplanation);
      }
      CheckHits.verifyExplanation("test query (prevTerm)", 0, prevTermScore, true, prevTermExplanation);
      if (prevTermScore < score) {
        System.out.println(prevTermExplanation);
        System.out.println(explanation);
        fail("score(" + freq + "," + (prevTerm) + ")=" + prevTermScore + " < score(" + freq + "," + term + ")=" + score);
      }
    }

    success = true;
  } finally {
    if (!success) {
      // dump every input of the scenario so a reproducer can be written
      System.out.println(similarity);
      System.out.println(corpus);
      System.out.println(term);
      if (norm == 0) {
        System.out.println("norms=omitted");
      } else {
        System.out.println("norm=" + norm + " (doc length ~ " + SmallFloat.byte4ToInt((byte) norm) + ")");
      }
      System.out.println("freq=" + freq);
      System.out.println("boost=" + boost);
    }
  }
}
}

View File

@ -434,7 +434,7 @@ public final class TestUtil {
/** start and end are BOTH inclusive */
public static long nextLong(Random r, long start, long end) {
assert end >= start;
assert end >= start : "start=" + start + ",end=" + end;
final BigInteger range = BigInteger.valueOf(end).add(BigInteger.valueOf(1)).subtract(BigInteger.valueOf(start));
if (range.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) <= 0) {
return start + r.nextInt(range.intValue());