LUCENE-7730: Better accuracy for the length normalization factor.

Adrien Grand 2017-05-18 16:27:31 +02:00
parent c53d19e7b2
commit 06a6034d9b
45 changed files with 1351 additions and 1313 deletions

View File

@ -63,6 +63,9 @@ Improvements
* LUCENE-7489: Better storage of sparse doc-values fields with the default
codec. (Adrien Grand)
* LUCENE-7730: More accurate encoding of the length normalization factor
thanks to the removal of index-time boosts. (Adrien Grand)
Optimizations
* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both

View File

@ -603,7 +603,7 @@ final class DefaultIndexingChain extends DocConsumer {
// PerField.invert to allow for later downgrading of the index options:
fi.setIndexOptions(fieldType.indexOptions());
fp = new PerField(fi, invert);
fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
@ -633,6 +633,7 @@ final class DefaultIndexingChain extends DocConsumer {
/** NOTE: not static: accesses at least docState, termsHash. */
private final class PerField implements Comparable<PerField> {
final int indexCreatedVersionMajor;
final FieldInfo fieldInfo;
final Similarity similarity;
@ -659,7 +660,8 @@ final class DefaultIndexingChain extends DocConsumer {
// reused
TokenStream tokenStream;
public PerField(FieldInfo fieldInfo, boolean invert) {
public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.fieldInfo = fieldInfo;
similarity = docState.similarity;
if (invert) {
@ -668,7 +670,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
void setInvertState() {
invertState = new FieldInvertState(fieldInfo.name);
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
assert norms == null;

View File

@ -193,6 +193,10 @@ class DocumentsWriterPerThread {
return fieldInfos;
}
public int getIndexCreatedVersionMajor() {
return indexWriter.segmentInfos.getIndexCreatedVersionMajor();
}
final void testPoint(String message) {
if (enableTestPoints) {
assert infoStream.isEnabled("TP"); // don't enable unless you need them.

View File

@ -31,7 +31,8 @@ import org.apache.lucene.util.AttributeSource;
* @lucene.experimental
*/
public final class FieldInvertState {
String name;
final int indexCreatedVersionMajor;
final String name;
int position;
int length;
int numOverlap;
@ -50,14 +51,15 @@ public final class FieldInvertState {
/** Creates {@code FieldInvertState} for the specified
* field name. */
public FieldInvertState(String name) {
public FieldInvertState(int indexCreatedVersionMajor, String name) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.name = name;
}
/** Creates {@code FieldInvertState} for the specified
* field name and values for all fields. */
public FieldInvertState(String name, int position, int length, int numOverlap, int offset) {
this.name = name;
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
this(indexCreatedVersionMajor, name);
this.position = position;
this.length = length;
this.numOverlap = numOverlap;
@ -164,4 +166,11 @@ public final class FieldInvertState {
public String getName() {
return name;
}
/**
* Return the version that was used to create the index, or 6 if it was created before 7.0.
*/
public int getIndexCreatedVersionMajor() {
return indexCreatedVersionMajor;
}
}
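
The created-version is threaded through from the indexing chain above, so a Similarity can pick the norm encoding that matches the segment. A minimal construction sketch (illustration, not part of the commit; it mirrors the updated tests further down):

    FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo",
        /*position*/ 3, /*length*/ 10, /*numOverlap*/ 0, /*offset*/ 100);
    int major = state.getIndexCreatedVersionMajor(); // 7 for current indexes, 6 for pre-7.0 ones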

View File

@ -96,20 +96,6 @@ public class BM25Similarity extends Similarity {
}
}
/** The default implementation encodes <code>1 / sqrt(length)</code>
* with {@link SmallFloat#floatToByte315(float)}. This is compatible with
* Lucene's historic implementation: {@link ClassicSimilarity}. If you
* change this, then you should change {@link #decodeNormValue(byte)} to match. */
protected byte encodeNormValue(int fieldLength) {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
}
/** The default implementation returns <code>1 / f<sup>2</sup></code>
* where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
protected float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF];
}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
@ -132,21 +118,31 @@ public class BM25Similarity extends Similarity {
}
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
private static final float[] OLD_LENGTH_TABLE = new float[256];
private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
float f = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (f*f);
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
}
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
for (int i = 0; i < 256; i++) {
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
return encodeNormValue(numTerms);
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
if (indexCreatedVersionMajor >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
}
}
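
On a 7.0+ index the stored byte now encodes the (overlap-discounted) term count itself rather than a quantized 1/sqrt(length). A worked sketch of the new round-trip (illustration, not part of the commit):

    int numTerms = 5;
    byte norm = SmallFloat.intToByte4(numTerms); // small lengths like 5 are encoded exactly
    int back = SmallFloat.byte4ToInt(norm);      // back == 5; larger lengths keep about 4 significant bits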
/**
@ -207,34 +203,43 @@ public class BM25Similarity extends Similarity {
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
float avgdl = avgFieldLength(collectionStats);
// compute freq-independent part of bm25 equation across all norm values
float cache[] = new float[256];
float[] oldCache = new float[256];
float[] cache = new float[256];
for (int i = 0; i < cache.length; i++) {
cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
BM25Stats bm25stats = (BM25Stats) stats;
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
}
private class BM25DocScorer extends SimScorer {
private final BM25Stats stats;
private final float weightValue; // boost * idf * (k1 + 1)
private final NumericDocValues norms;
/** precomputed cache for all length values */
private final float[] lengthCache;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.weight * (k1 + 1);
this.cache = stats.cache;
this.norms = norms;
if (indexCreatedVersionMajor >= 7) {
lengthCache = LENGTH_TABLE;
cache = stats.cache;
} else {
lengthCache = OLD_LENGTH_TABLE;
cache = stats.oldCache;
}
}
@Override
@ -245,7 +250,7 @@ public class BM25Similarity extends Similarity {
norm = k1;
} else {
if (norms.advanceExact(doc)) {
norm = cache[(byte)norms.longValue() & 0xFF];
norm = cache[((byte) norms.longValue()) & 0xFF];
} else {
norm = cache[0];
}
@ -255,7 +260,7 @@ public class BM25Similarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms);
return explainScore(doc, freq, stats, norms, lengthCache);
}
@Override
@ -281,21 +286,23 @@ public class BM25Similarity extends Similarity {
private final float weight;
/** field name, for pulling norms */
private final String field;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float cache[];
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
* for both OLD_LENGTH_TABLE and LENGTH_TABLE */
private final float[] oldCache, cache;
BM25Stats(String field, float boost, Explanation idf, float avgdl, float cache[]) {
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
this.field = field;
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.cache = cache;
this.weight = idf.getValue() * boost;
this.oldCache = oldCache;
this.cache = cache;
}
}
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "parameter k1"));
@ -311,7 +318,7 @@ public class BM25Similarity extends Similarity {
} else {
norm = 0;
}
float doclen = decodeNormValue(norm);
float doclen = lengthCache[norm & 0xff];
subs.add(Explanation.match(b, "parameter b"));
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
subs.add(Explanation.match(doclen, "fieldLength"));
@ -321,13 +328,13 @@ public class BM25Similarity extends Similarity {
}
}
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
Explanation boostExpl = Explanation.match(stats.boost, "boost");
List<Explanation> subs = new ArrayList<>();
if (boostExpl.getValue() != 1.0f)
subs.add(boostExpl);
subs.add(stats.idf);
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms);
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
subs.add(tfNormExpl);
return Explanation.match(
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),

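For reference, the pieces above compose at score time as a single table lookup per document; this sketch (illustration, not part of the commit) matches BM25DocScorer.score:

    // cache[n]    = k1 * ((1 - b) + b * LENGTH_TABLE[n] / avgdl)  -- or OLD_LENGTH_TABLE pre-7.0
    // weightValue = boost * idf * (k1 + 1)
    float norm = cache[((byte) norms.longValue()) & 0xFF];
    float score = weightValue * freq / (freq + norm);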
View File

@ -17,91 +17,27 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
* encodes} norm values as a single byte before being stored. At search time,
* the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
* instance, <i>decode(encode(0.89)) = 0.875</i>.
* <p>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for all
* documents - are maintained in memory.
* <p>
* The rationale supporting such lossy compression of norm values is that given
* the difficulty (and inaccuracy) of users to express their true information
* need by a query, only big differences matter. <br>
* &nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of
* scoring, e.g. by using a different {@link Similarity} for search.
* Expert: Historical scoring implementation. You might want to consider using
* {@link BM25Similarity} instead, which is generally considered superior to
* TF-IDF.
*/
public class ClassicSimilarity extends TFIDFSimilarity {
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Sole constructor: parameter-free */
public ClassicSimilarity() {}
/**
* Encodes a normalization factor for storage in an index.
* <p>
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
* zero-exponent point at 15, thus representing values from around 7x10^9 to
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
* represented. Negative numbers are rounded up to zero. Values too large to
* represent are rounded down to the largest representable value. Positive
* values too small to represent are rounded up to the smallest positive
* representable value.
*
* @see org.apache.lucene.util.SmallFloat
*/
@Override
public final long encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/**
* Decodes the norm value, assuming it is a single byte.
*
* @see #encodeNormValue(float)
*/
@Override
public final float decodeNormValue(long norm) {
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
}
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
* #setDiscountOverlaps} is false, else it's {@link
* FieldInvertState#getLength()} - {@link
* FieldInvertState#getNumOverlap()}.
* <code>1/sqrt(length)</code>.
*
* @lucene.experimental */
@Override
public float lengthNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
public float lengthNorm(int numTerms) {
return (float) (1.0 / Math.sqrt(numTerms));
}
@ -138,33 +74,6 @@ public class ClassicSimilarity extends TFIDFSimilarity {
public float idf(long docFreq, long docCount) {
return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true;
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
@Override
public String toString() {

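With computeNorm now handled by TFIDFSimilarity (including the discountOverlaps and version logic), subclasses only override the int-based hook. A sketch under the new API; the class name is invented, but the same override appears in TestDisjunctionMaxQuery below:

    // Hypothetical subclass illustrating the new lengthNorm(int) hook.
    public class NoLengthNormSimilarity extends ClassicSimilarity {
      @Override
      public float lengthNorm(int length) {
        return 1f; // disable length normalization
      }
    }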
View File

@ -190,7 +190,8 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
if (stats instanceof MultiSimilarity.MultiStats) {
// a multi term query (e.g. phrase). return the summation,
// scoring almost as if it were boolean query
@ -198,12 +199,12 @@ public abstract class SimilarityBase extends Similarity {
SimScorer subScorers[] = new SimScorer[subStats.length];
for (int i = 0; i < subScorers.length; i++) {
BasicStats basicstats = (BasicStats) subStats[i];
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
return new MultiSimilarity.MultiSimScorer(subScorers);
} else {
BasicStats basicstats = (BasicStats) stats;
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
}
@ -216,40 +217,38 @@ public abstract class SimilarityBase extends Similarity {
// ------------------------------ Norm handling ------------------------------
/** Norm to document length map. */
private static final float[] NORM_TABLE = new float[256];
/** Cache of decoded bytes. */
private static final float[] OLD_LENGTH_TABLE = new float[256];
private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
float f = SmallFloat.byte315ToFloat((byte)i);
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
}
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
for (int i = 0; i < 256; i++) {
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
/** Encodes the document length in the same way as {@link BM25Similarity}. */
@Override
public long computeNorm(FieldInvertState state) {
final float numTerms;
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
return encodeNormValue(numTerms);
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
if (indexCreatedVersionMajor >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
}
}
/** Decodes a normalization factor (document length) stored in an index.
* @see #encodeNormValue(float)
*/
protected float decodeNormValue(byte norm) {
return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes the length to a byte via SmallFloat. */
protected byte encodeNormValue(float length) {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
}
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */
@ -266,35 +265,37 @@ public abstract class SimilarityBase extends Similarity {
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
* respectively.
*/
private class BasicSimScorer extends SimScorer {
final class BasicSimScorer extends SimScorer {
private final BasicStats stats;
private final NumericDocValues norms;
private final float[] normCache;
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.norms = norms;
this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
}
private float getNormValue(int doc) throws IOException {
float getLengthValue(int doc) throws IOException {
if (norms == null) {
return 1F;
}
if (norms.advanceExact(doc)) {
return decodeNormValue((byte) norms.longValue());
return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
} else {
return decodeNormValue((byte) 0);
return 0;
}
}
@Override
public float score(int doc, float freq) throws IOException {
// We have to supply something in case norms are omitted
return SimilarityBase.this.score(stats, freq, getNormValue(doc));
return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return SimilarityBase.this.explain(stats, doc, freq, getNormValue(doc));
return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
}
@Override

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
@ -233,11 +234,6 @@ import org.apache.lucene.util.BytesRef;
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
@ -379,13 +375,49 @@ import org.apache.lucene.util.BytesRef;
* @see IndexSearcher#setSimilarity(Similarity)
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Cache of decoded bytes. */
static final float[] OLD_NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public TFIDFSimilarity() {}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true;
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(long, long)}
* factor for each term in the query and these products are then summed to
@ -471,30 +503,25 @@ public abstract class TFIDFSimilarity extends Similarity {
/**
* Compute an index-time normalization value for this field instance.
* <p>
* This value will be stored in a single byte lossy representation by
* {@link #encodeNormValue(float)}.
*
* @param state statistics of the current field (such as length, boost, etc)
* @return an index-time normalization value
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) discounting overlaps}
* @return a length normalization value
*/
public abstract float lengthNorm(FieldInvertState state);
public abstract float lengthNorm(int length);
@Override
public final long computeNorm(FieldInvertState state) {
float normValue = lengthNorm(state);
return encodeNormValue(normValue);
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
if (state.getIndexCreatedVersionMajor() >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315(lengthNorm(numTerms));
}
}
/**
* Decodes a normalization factor stored in an index.
*
* @see #encodeNormValue(float)
*/
public abstract float decodeNormValue(long norm);
/** Encodes a normalization factor for storage in an index. */
public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@ -529,24 +556,41 @@ public abstract class TFIDFSimilarity extends Similarity {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
return new IDFStats(collectionStats.field(), boost, idf);
float[] normTable = new float[256];
for (int i = 1; i < 256; ++i) {
int length = SmallFloat.byte4ToInt((byte) i);
float norm = lengthNorm(length);
normTable[i] = norm;
}
normTable[0] = 1f / normTable[255];
return new IDFStats(collectionStats.field(), boost, idf, normTable);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
IDFStats idfstats = (IDFStats) stats;
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
final float[] normTable;
if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
// the norms only encode the length; we need a translation table that depends on how lengthNorm is implemented
normTable = idfstats.normTable;
} else {
// the norm is directly encoded in the index
normTable = OLD_NORM_TABLE;
}
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
}
private final class TFIDFSimScorer extends SimScorer {
private final IDFStats stats;
private final float weightValue;
private final NumericDocValues norms;
private final float[] normTable;
TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
this.stats = stats;
this.weightValue = stats.queryWeight;
this.norms = norms;
this.normTable = normTable;
}
@Override
@ -556,13 +600,13 @@ public abstract class TFIDFSimilarity extends Similarity {
if (norms == null) {
return raw;
} else {
long normValue;
float normValue;
if (norms.advanceExact(doc)) {
normValue = norms.longValue();
normValue = normTable[(int) (norms.longValue() & 0xFF)];
} else {
normValue = 0;
}
return raw * decodeNormValue(normValue); // normalize for field
return raw * normValue; // normalize for field
}
}
@ -578,35 +622,39 @@ public abstract class TFIDFSimilarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms);
return explainScore(doc, freq, stats, norms, normTable);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
private static class IDFStats extends SimWeight {
static class IDFStats extends SimWeight {
private final String field;
/** The idf and its explanation */
private final Explanation idf;
private final float boost;
private final float queryWeight;
final float[] normTable;
public IDFStats(String field, float boost, Explanation idf) {
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.boost = boost;
this.queryWeight = boost * idf.getValue();
this.normTable = normTable;
}
}
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
float norm;
if (norms != null && norms.advanceExact(doc)) {
norm = decodeNormValue(norms.longValue());
} else {
if (norms == null) {
norm = 1f;
} else if (norms.advanceExact(doc) == false) {
norm = 0f;
} else {
norm = normTable[(int) (norms.longValue() & 0xFF)];
}
Explanation fieldNormExpl = Explanation.match(
@ -619,9 +667,9 @@ public abstract class TFIDFSimilarity extends Similarity {
tfExplanation, stats.idf, fieldNormExpl);
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation queryExpl = Explanation.match(stats.boost, "boost");
Explanation fieldExpl = explainField(doc, freq, stats, norms);
Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
if (stats.boost == 1f) {
return fieldExpl;
}
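
A worked example of the new search-time path (illustration, not part of the commit): on a 7.0+ index with ClassicSimilarity, a 4-term field stores SmallFloat.intToByte4(4) == 4 as its norm, and computeWeight has pre-filled normTable[4] = lengthNorm(SmallFloat.byte4ToInt(4)) = 1/sqrt(4) = 0.5f, so the scorer's

    float normValue = normTable[(int) (norms.longValue() & 0xFF)]; // == 0.5f in this example

scales the raw tf*idf score by 0.5 for that document.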

View File

@ -97,31 +97,74 @@ public class SmallFloat {
return Float.intBitsToFloat(bits);
}
/** floatToByte(b, mantissaBits=5, zeroExponent=2)
* <br>smallest nonzero value = 0.033203125
* <br>largest value = 1984.0
* <br>epsilon = 0.03125
*/
public static byte floatToByte52(float f) {
int bits = Float.floatToRawIntBits(f);
int smallfloat = bits >> (24-5);
if (smallfloat <= (63-2)<<5) {
return (bits<=0) ? (byte)0 : (byte)1;
/** Float-like encoding for positive longs that preserves ordering and 4 significant bits. */
public static int longToInt4(long i) {
if (i < 0) {
throw new IllegalArgumentException("Only supports positive values, got " + i);
}
if (smallfloat >= ((63-2)<<5) + 0x100) {
return -1;
int numBits = 64 - Long.numberOfLeadingZeros(i);
if (numBits < 4) {
// subnormal value
return Math.toIntExact(i);
} else {
// normal value
int shift = numBits - 4;
// only keep the 4 most significant bits
int encoded = Math.toIntExact(i >>> shift);
// clear the most significant bit, which is implicit
encoded &= 0x07;
// encode the shift, adding 1 because 0 is reserved for subnormal values
encoded |= (shift + 1) << 3;
return encoded;
}
return (byte)(smallfloat - ((63-2)<<5));
}
/** byteToFloat(b, mantissaBits=5, zeroExponent=2) */
public static float byte52ToFloat(byte b) {
// on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup
// is only a little bit faster (anywhere from 0% to 7%)
if (b == 0) return 0.0f;
int bits = (b&0xff) << (24-5);
bits += (63-2) << 24;
return Float.intBitsToFloat(bits);
/**
* Decode values encoded with {@link #longToInt4(long)}.
*/
public static final long int4ToLong(int i) {
long bits = i & 0x07;
int shift = (i >>> 3) - 1;
long decoded;
if (shift == -1) {
// subnormal value
decoded = bits;
} else {
// normal value
decoded = (bits | 0x08) << shift;
}
return decoded;
}
private static final int MAX_INT4 = longToInt4(Integer.MAX_VALUE);
private static final int NUM_FREE_VALUES = 255 - MAX_INT4;
/**
* Encode an integer to a byte. It is built upon {@link #longToInt4(long)}
* and leverages the fact that {@code longToInt4(Integer.MAX_VALUE)} is
* less than 255 to encode low values more accurately.
*/
public static byte intToByte4(int i) {
if (i < 0) {
throw new IllegalArgumentException("Only supports positive values, got " + i);
}
if (i < NUM_FREE_VALUES) {
return (byte) i;
} else {
return (byte) (NUM_FREE_VALUES + longToInt4(i - NUM_FREE_VALUES));
}
}
/**
* Decode values that have been encoded with {@link #intToByte4(int)}.
*/
public static int byte4ToInt(byte b) {
int i = Byte.toUnsignedInt(b);
if (i < NUM_FREE_VALUES) {
return i;
} else {
long decoded = NUM_FREE_VALUES + int4ToLong(i - NUM_FREE_VALUES);
return Math.toIntExact(decoded);
}
}
}
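
A quick round-trip sketch of the new encoding (illustration, not part of the commit); the concrete values follow from the code above, where NUM_FREE_VALUES works out to 24:

    for (int i : new int[] {1, 2, 4, 40, 41, 1000}) {
      byte b = SmallFloat.intToByte4(i);
      int back = SmallFloat.byte4ToInt(b);
      // 1, 2, 4 and 40 round-trip exactly; 41 decodes back to 40 and 1000 to 984,
      // since values past the 24 identity-coded slots keep only 4 significant bits.
      // Ordering is always preserved, which is what the norm tables rely on.
      System.out.println(i + " -> " + (b & 0xFF) + " -> " + back);
    }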

View File

@ -2441,7 +2441,7 @@ public class TestIndexSorting extends LuceneTestCase {
assertTrue(sparseValues.advanceExact(docID));
assertTrue(sparseBinaryValues.advanceExact(docID));
assertTrue(normsValues.advanceExact(docID));
assertEquals(124, normsValues.longValue());
assertEquals(1, normsValues.longValue());
assertEquals(127-docID, (int) sparseValues.longValue());
assertEquals(new BytesRef(Integer.toString(127-docID)), sparseBinaryValues.binaryValue());
} else {

View File

@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -26,7 +27,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -35,12 +38,12 @@ import org.apache.lucene.util.TestUtil;
/**
* Tests the maxTermFrequency statistic in FieldInvertState
*/
public class TestMaxTermFrequency extends LuceneTestCase {
Directory dir;
IndexReader reader;
/* expected maxTermFrequency values for our documents */
ArrayList<Integer> expected = new ArrayList<>();
@Override
public void setUp() throws Exception {
super.setUp();
@ -59,14 +62,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
reader = writer.getReader();
writer.close();
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
super.tearDown();
}
public void test() throws Exception {
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++) {
@ -95,30 +98,42 @@ public class TestMaxTermFrequency extends LuceneTestCase {
Collections.shuffle(terms, random());
return Arrays.toString(terms.toArray(new String[terms.size()]));
}
/**
* Simple similarity that encodes maxTermFrequency directly as a byte
*/
static class TestSimilarity extends TFIDFSimilarity {
static class TestSimilarity extends Similarity {
@Override
public float lengthNorm(FieldInvertState state) {
public long computeNorm(FieldInvertState state) {
return state.getMaxTermFrequency();
}
@Override
public long encodeNormValue(float f) {
return (byte) f;
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}
@Override
public float decodeNormValue(long norm) {
return norm;
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float score(int doc, float freq) throws IOException {
return 0;
}
@Override
public float computeSlopFactor(int distance) {
return 0;
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return 0;
}
};
}
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
}

View File

@ -32,13 +32,11 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
@ -49,67 +47,6 @@ import org.apache.lucene.util.TestUtil;
@Slow
public class TestNorms extends LuceneTestCase {
static final String BYTE_TEST_FIELD = "normsTestByte";
static class CustomNormEncodingSimilarity extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float lengthNorm(FieldInvertState state) {
return state.getLength();
}
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
// LUCENE-1260
public void testCustomEncoder() throws Exception {
Directory dir = newDirectory();
MockAnalyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig config = newIndexWriterConfig(analyzer);
config.setSimilarity(new CustomNormEncodingSimilarity());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
Document doc = new Document();
Field foo = newTextField("foo", "", Field.Store.NO);
Field bar = newTextField("bar", "", Field.Store.NO);
doc.add(foo);
doc.add(bar);
for (int i = 0; i < 100; i++) {
bar.setStringValue("singleton");
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
writer.close();
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++) {
assertEquals(i, fooNorms.nextDoc());
assertEquals(0, fooNorms.longValue());
}
NumericDocValues barNorms = MultiDocValues.getNormValues(reader, "bar");
for (int i = 0; i < reader.maxDoc(); i++) {
assertEquals(i, barNorms.nextDoc());
assertEquals(1, barNorms.longValue());
}
reader.close();
dir.close();
}
public void testMaxByteNorms() throws IOException {
Directory dir = newFSDirectory(createTempDir("TestNorms.testMaxByteNorms"));

View File

@ -44,9 +44,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestOmitTf extends LuceneTestCase {
public static class SimpleSimilarity extends TFIDFSimilarity {
@Override public float decodeNormValue(long norm) { return norm; }
@Override public long encodeNormValue(float f) { return (long) f; }
@Override public float lengthNorm(FieldInvertState state) { return 1; }
@Override public float lengthNorm(int length) { return 1; }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long docCount) { return 1.0f; }

View File

@ -30,7 +30,6 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@ -72,7 +71,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
}
@Override
public float lengthNorm(FieldInvertState state) {
public float lengthNorm(int length) {
// Disable length norm
return 1;
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldValueHitQueue.Entry;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@ -63,7 +64,7 @@ public class TestElevationComparator extends LuceneTestCase {
writer.close();
IndexSearcher searcher = newSearcher(r);
searcher.setSimilarity(new ClassicSimilarity());
searcher.setSimilarity(new BM25Similarity());
runTest(searcher, true);
runTest(searcher, false);
@ -98,11 +99,11 @@ public class TestElevationComparator extends LuceneTestCase {
assertEquals(3, topDocs.scoreDocs[1].doc);
if (reversed) {
assertEquals(2, topDocs.scoreDocs[2].doc);
assertEquals(1, topDocs.scoreDocs[3].doc);
} else {
assertEquals(1, topDocs.scoreDocs[2].doc);
assertEquals(2, topDocs.scoreDocs[3].doc);
} else {
assertEquals(2, topDocs.scoreDocs[2].doc);
assertEquals(1, topDocs.scoreDocs[3].doc);
}
/*

View File

@ -37,6 +37,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -309,7 +310,7 @@ public class TestPhraseQuery extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(new MockAnalyzer(random()))
.setMergePolicy(newLogMergePolicy())
.setSimilarity(new ClassicSimilarity()));
.setSimilarity(new BM25Similarity()));
Document doc = new Document();
doc.add(newTextField("field", "foo firstname lastname foo", Field.Store.YES));
@ -335,9 +336,9 @@ public class TestPhraseQuery extends LuceneTestCase {
// each other get a higher score:
assertEquals(1.0, hits[0].score, 0.01);
assertEquals(0, hits[0].doc);
assertEquals(0.62, hits[1].score, 0.01);
assertEquals(0.63, hits[1].score, 0.01);
assertEquals(1, hits[1].doc);
assertEquals(0.43, hits[2].score, 0.01);
assertEquals(0.47, hits[2].score, 0.01);
assertEquals(2, hits[2].doc);
QueryUtils.check(random(), query,searcher);
reader.close();

View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@ -49,9 +50,14 @@ public class TestQueryRescorer extends LuceneTestCase {
return searcher;
}
public static IndexWriterConfig newIndexWriterConfig() {
// We rely on more tokens = lower score:
return LuceneTestCase.newIndexWriterConfig().setSimilarity(new ClassicSimilarity());
}
public void testBasic() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(newStringField("id", "0", Field.Store.YES));
@ -106,7 +112,7 @@ public class TestQueryRescorer extends LuceneTestCase {
// Test LUCENE-5682
public void testNullScorerTermQuery() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(newStringField("id", "0", Field.Store.YES));
@ -145,7 +151,7 @@ public class TestQueryRescorer extends LuceneTestCase {
public void testCustomCombine() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(newStringField("id", "0", Field.Store.YES));
@ -196,7 +202,7 @@ public class TestQueryRescorer extends LuceneTestCase {
public void testExplain() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(newStringField("id", "0", Field.Store.YES));
@ -271,7 +277,7 @@ public class TestQueryRescorer extends LuceneTestCase {
public void testMissingSecondPassScore() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(newStringField("id", "0", Field.Store.YES));
@ -325,7 +331,7 @@ public class TestQueryRescorer extends LuceneTestCase {
public void testRandom() throws Exception {
Directory dir = newDirectory();
int numDocs = atLeast(1000);
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
final int[] idToNum = new int[numDocs];
int maxValue = TestUtil.nextInt(random(), 10, 1000000);

View File

@ -17,20 +17,18 @@
package org.apache.lucene.search;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.LuceneTestCase;
/** Similarity unit test.
*
@ -39,7 +37,7 @@ import org.apache.lucene.document.Document;
public class TestSimilarity extends LuceneTestCase {
public static class SimpleSimilarity extends ClassicSimilarity {
@Override public float lengthNorm(FieldInvertState state) { return 1; }
@Override public float lengthNorm(int length) { return 1; }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long docCount) { return 1.0f; }

View File

@ -17,19 +17,21 @@
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -38,7 +40,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
private Directory directory;
private DirectoryReader reader;
private IndexSearcher searcher;
@Override
public void setUp() throws Exception {
super.setUp();
@ -51,7 +53,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
doc.add(field);
Field field2 = newTextField("bar", "", Field.Store.NO);
doc.add(field2);
field.setStringValue("quick brown fox");
field2.setStringValue("quick brown fox");
iw.addDocument(doc);
@ -63,14 +65,14 @@ public class TestSimilarityProvider extends LuceneTestCase {
searcher = newSearcher(reader);
searcher.setSimilarity(sim);
}
@Override
public void tearDown() throws Exception {
reader.close();
directory.close();
super.tearDown();
}
public void testBasics() throws Exception {
// sanity check of norms writer
// TODO: generalize
@ -81,7 +83,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
assertEquals(i, barNorms.nextDoc());
assertFalse(fooNorms.longValue() == barNorms.longValue());
}
// sanity check of searching
TopDocs foodocs = searcher.search(new TermQuery(new Term("foo", "brown")), 10);
assertTrue(foodocs.totalHits > 0);
@ -89,11 +91,11 @@ public class TestSimilarityProvider extends LuceneTestCase {
assertTrue(bardocs.totalHits > 0);
assertTrue(foodocs.scoreDocs[0].score < bardocs.scoreDocs[0].score);
}
private static class ExampleSimilarityProvider extends PerFieldSimilarityWrapper {
private Similarity sim1 = new Sim1();
private Similarity sim2 = new Sim2();
@Override
public Similarity get(String field) {
if (field.equals("foo")) {
@ -103,80 +105,73 @@ public class TestSimilarityProvider extends LuceneTestCase {
}
}
}
private static class Sim1 extends TFIDFSimilarity {
private static class Sim1 extends Similarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
public long computeNorm(FieldInvertState state) {
return 1;
}
@Override
public float lengthNorm(FieldInvertState state) {
return 1f;
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}
@Override
public float sloppyFreq(int distance) {
return 1f;
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float score(int doc, float freq) throws IOException {
return 1;
}
@Override
public float computeSlopFactor(int distance) {
return 1;
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return 1;
}
};
}
@Override
public float tf(float freq) {
return 1f;
}
@Override
public float idf(long docFreq, long docCount) {
return 1f;
}
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1f;
}
}
private static class Sim2 extends TFIDFSimilarity {
private static class Sim2 extends Similarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float lengthNorm(FieldInvertState state) {
return 10f;
public long computeNorm(FieldInvertState state) {
return 10;
}
@Override
public float sloppyFreq(int distance) {
return 10f;
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}
@Override
public float tf(float freq) {
return 10f;
}
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float idf(long docFreq, long docCount) {
return 10f;
}
@Override
public float score(int doc, float freq) throws IOException {
return 10;
}
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1f;
@Override
public float computeSlopFactor(int distance) {
return 1;
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return 1;
}
};
}
}
}

View File

@ -42,7 +42,7 @@ public class TestSortRescorer extends LuceneTestCase {
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setSimilarity(new ClassicSimilarity()));
Document doc = new Document();
doc.add(newStringField("id", "1", Field.Store.YES));

View File

@ -20,19 +20,6 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestAxiomaticSimilarity extends LuceneTestCase {
public void testSaneNormValues() {
Axiomatic sim = new AxiomaticF2EXP();
for (int i = 0; i < 256; i++) {
float len = sim.decodeNormValue((byte) i);
assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
assertFalse("nan len for byte=" + i, Float.isNaN(len));
if (i > 0) {
assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte) (i - 1)));
}
}
}
public void testIllegalS() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new AxiomaticF2EXP(Float.POSITIVE_INFINITY, 0.1f);

View File

@ -17,23 +17,27 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
public class TestBM25Similarity extends LuceneTestCase {
public void testSaneNormValues() {
BM25Similarity sim = new BM25Similarity();
for (int i = 0; i < 256; i++) {
float len = sim.decodeNormValue((byte) i);
assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
assertFalse("nan len for byte=" + i, Float.isNaN(len));
if (i > 0) {
assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1)));
}
}
}
public void testIllegalK1() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new BM25Similarity(Float.POSITIVE_INFINITY, 0.75f);
@ -72,4 +76,44 @@ public class TestBM25Similarity extends LuceneTestCase {
});
assertTrue(expected.getMessage().contains("illegal b value"));
}
public void testLengthEncodingBackwardCompatibility() throws IOException {
Similarity similarity = new BM25Similarity();
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately in both cases
Directory dir = newDirectory();
// set the version on the directory
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
Document doc = new Document();
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
doc.add(new TextField("foo", value, Store.NO));
w.addDocument(doc);
IndexReader reader = DirectoryReader.open(w);
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(similarity);
Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
Explanation docLen = findExplanation(expl, "fieldLength");
assertNotNull(docLen);
assertEquals(docLen.toString(), length, (int) docLen.getValue());
w.close();
reader.close();
dir.close();
}
}
}
private static Explanation findExplanation(Explanation expl, String text) {
if (expl.getDescription().equals(text)) {
return expl;
} else {
for (Explanation sub : expl.getDetails()) {
Explanation match = findExplanation(sub, text);
if (match != null) {
return match;
}
}
}
return null;
}
}
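
The backward-compatibility test above pins the index-created version by committing a fresh SegmentInfos before the writer ever touches the directory; isolated, the trick looks like this (illustration, not part of the commit):

    Directory dir = newDirectory();
    new SegmentInfos(Version.LUCENE_6_0_0.major).commit(dir); // directory now reads as a 6.x-created index
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());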

View File

@ -34,6 +34,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestBooleanSimilarity extends LuceneTestCase {
@ -105,8 +106,8 @@ public class TestBooleanSimilarity extends LuceneTestCase {
for (int iter = 0; iter < 100; ++iter) {
final int length = TestUtil.nextInt(random(), 1, 100);
final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(50);
FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100);
final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
assertEquals(
sim2.computeNorm(state),
sim1.computeNorm(state),

View File

@ -19,24 +19,34 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestClassicSimilarity extends LuceneTestCase {
private Directory directory;
@ -63,14 +73,6 @@ public class TestClassicSimilarity extends LuceneTestCase {
IOUtils.close(indexReader, directory);
super.tearDown();
}
// Javadocs give this as an example so we test to make sure it's correct:
public void testPrecisionLoss() throws Exception {
ClassicSimilarity sim = new ClassicSimilarity();
float v = sim.decodeNormValue(sim.encodeNormValue(.89f));
assertEquals(0.875f, v, 0.0001f);
}
public void testHit() throws IOException {
Query query = new TermQuery(new Term("test", "hit"));
@ -159,16 +161,83 @@ public class TestClassicSimilarity extends LuceneTestCase {
assertTrue(topDocs.scoreDocs[0].score != 0);
}
public void testSaneNormValues() {
public void testSaneNormValues() throws IOException {
ClassicSimilarity sim = new ClassicSimilarity();
for (int i = 0; i < 256; i++) {
float boost = sim.decodeNormValue((byte) i);
float boost = TFIDFSimilarity.OLD_NORM_TABLE[i];
assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
if (i > 0) {
assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > TFIDFSimilarity.OLD_NORM_TABLE[i-1]);
}
}
TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, new IndexSearcher(new MultiReader()).collectionStatistics("foo"));
for (int i = 0; i < 256; i++) {
float boost = stats.normTable[i];
assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
if (i > 0) {
assertTrue("boost is not decreasing: " + boost + ",byte=" + i, boost < stats.normTable[i-1]);
}
}
}
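Note: the two loops above check opposite monotonicity on purpose. OLD_NORM_TABLE decodes the legacy boost-bearing norm bytes, so it increases with the byte value, while the new per-weight normTable caches the length norm for each encoded length, so it decreases. A minimal sketch of how such a cache can be built, assuming SmallFloat.byte4ToInt is the length decoder (see the SmallFloat tests further down); this is an illustration, not the exact TFIDFSimilarity code:
// Sketch: one cached norm per possible encoded length byte, so that
// scoring is a table lookup instead of a sqrt per document.
float[] normTable = new float[256];
normTable[0] = 1f; // byte 0 decodes to length 0; avoid a division by zero
for (int i = 1; i < 256; ++i) {
  int length = SmallFloat.byte4ToInt((byte) i);      // decoded field length
  normTable[i] = (float) (1.0 / Math.sqrt(length));  // longer field, smaller norm
}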
public void testNormEncodingBackwardCompatibility() throws IOException {
Similarity similarity = new ClassicSimilarity();
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
for (int length : new int[] {1, 4, 16}) { // these length values are encoded accurately in both cases
Directory dir = newDirectory();
// set the version on the directory
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
Document doc = new Document();
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
doc.add(new TextField("foo", value, Store.NO));
w.addDocument(doc);
IndexReader reader = DirectoryReader.open(w);
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(similarity);
Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
Explanation fieldNorm = findExplanation(expl, "fieldNorm");
assertNotNull(fieldNorm);
assertEquals(fieldNorm.toString(), 1/Math.sqrt(length), fieldNorm.getValue(), 0f);
w.close();
reader.close();
dir.close();
}
}
}
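Why these particular lengths: fieldNorm is 1/sqrt(length), so 1, 4 and 16 yield exactly 1.0, 0.5 and 0.25. These powers of two survive the legacy byte encoding of the norm without rounding, and 1, 4 and 16 also round-trip exactly through the new integer length encoding (values up to 16 are encoded accurately, see the SmallFloat tests further down), which is what makes an assertEquals with delta 0f valid for both index versions.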
private static Explanation findExplanation(Explanation expl, String text) {
if (expl.getDescription().startsWith(text)) {
return expl;
} else {
for (Explanation sub : expl.getDetails()) {
Explanation match = findExplanation(sub, text);
if (match != null) {
return match;
}
}
}
return null;
}
public void testSameNormsAsBM25() {
ClassicSimilarity sim1 = new ClassicSimilarity();
BM25Similarity sim2 = new BM25Similarity();
sim2.setDiscountOverlaps(true);
for (int iter = 0; iter < 100; ++iter) {
final int length = TestUtil.nextInt(random(), 1, 1000);
final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
assertEquals(
sim2.computeNorm(state),
sim1.computeNorm(state),
0f);
}
}
}

View File

@ -20,16 +20,23 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@ -37,9 +44,13 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.Similarity.SimWeight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
/**
* Tests the {@link SimilarityBase}-based Similarities. Contains unit tests and
@ -586,11 +597,11 @@ public class TestSimilarityBase extends LuceneTestCase {
// LUCENE-5221
public void testDiscountOverlapsBoost() throws IOException {
ClassicSimilarity expected = new ClassicSimilarity();
BM25Similarity expected = new BM25Similarity();
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
expected.setDiscountOverlaps(false);
actual.setDiscountOverlaps(false);
FieldInvertState state = new FieldInvertState("foo");
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo");
state.setLength(5);
state.setNumOverlap(2);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
@ -598,64 +609,32 @@ public class TestSimilarityBase extends LuceneTestCase {
actual.setDiscountOverlaps(true);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
}
public void testSaneNormValues() {
for (SimilarityBase sim : sims) {
for (int i = 0; i < 256; i++) {
float len = sim.decodeNormValue((byte) i);
assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f);
assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len));
assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len));
if (i > 0) {
assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1)));
}
}
}
}
/**
* make sure the similarity does not go crazy when tested against all possible norm values.
*/
public void testCrazyIndexTimeBoosts() throws Exception {
long avgLength = 750;
long docCount = 500000;
long numTokens = docCount * avgLength;
CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);
long docFreq = 2000;
long totalTermFreq = 2000 * avgLength;
TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);
for (SimilarityBase sim : sims) {
if (sim instanceof IBSimilarity) {
if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) {
// score goes infinite for tiny doc lengths and negative for huge doc lengths
// TODO: fix this
continue;
}
} else if (sim instanceof DFRSimilarity) {
BasicModel model = ((DFRSimilarity)sim).getBasicModel();
if (model instanceof BasicModelD || model instanceof BasicModelP) {
// score goes NaN for tiny doc lengths
// TODO: fix this
continue;
} else if (model instanceof BasicModelBE) {
// score goes negative infinity for tiny doc lengths
// TODO: fix this
continue;
}
}
BasicStats stats = (BasicStats) sim.computeWeight(1f, collectionStats, termStats);
for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) {
for (int i = 0; i < 256; i++) {
float len = sim.decodeNormValue((byte) i);
float score = sim.score(stats, tf, len);
assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
}
public void testLengthEncodingBackwardCompatibility() throws IOException {
Similarity similarity = RandomPicks.randomFrom(random(), sims);
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately in both cases
Directory dir = newDirectory();
// set the version on the directory
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
Document doc = new Document();
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
doc.add(new TextField("foo", value, Store.NO));
w.addDocument(doc);
IndexReader reader = DirectoryReader.open(w);
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(similarity);
Term term = new Term("foo", "b");
TermContext context = TermContext.build(reader.getContext(), term);
SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
float docLength = simScorer.getLengthValue(0);
assertEquals(length, (int) docLength);
w.close();
reader.close();
dir.close();
}
}
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.util;
import java.util.Arrays;
public class TestSmallFloat extends LuceneTestCase {
// original lucene byteToFloat
@ -87,10 +89,6 @@ public class TestSmallFloat extends LuceneTestCase {
float f3 = SmallFloat.byte315ToFloat((byte)i);
assertEquals(f1,f2,0.0);
assertEquals(f2,f3,0.0);
float f4 = SmallFloat.byteToFloat((byte)i,5,2);
float f5 = SmallFloat.byte52ToFloat((byte)i);
assertEquals(f4,f5,0.0);
}
}
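For context, the 3,15 variant kept above packs a float into a byte as eeeeemmm: a 5-bit exponent with its zero point at 15 and 3 significant mantissa bits (counting the implicit leading bit). A sketch of the decoding, assuming that layout; the real code lives in SmallFloat.byte315ToFloat:
// Sketch of 3.15 decoding: shift the byte into the float's
// exponent/mantissa bit positions, then re-bias the exponent.
static float byte315ToFloatSketch(byte b) {
  if (b == 0) return 0.0f;            // zero has a dedicated encoding
  int bits = (b & 0xff) << (24 - 3);  // place exponent and mantissa bits
  bits += (63 - 15) << 24;            // re-bias the 5-bit exponent
  return Float.intBitsToFloat(bits);
}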
@ -121,10 +119,51 @@ public class TestSmallFloat extends LuceneTestCase {
byte b3 = SmallFloat.floatToByte315(f);
assertEquals(b1,b2);
assertEquals(b2,b3);
}
}
byte b4 = SmallFloat.floatToByte(f,5,2);
byte b5 = SmallFloat.floatToByte52(f);
assertEquals(b4,b5);
public void testInt4() {
for (int i = 0; i <= 16; ++i) {
// all values in 0-16 are encoded accurately
assertEquals(i, SmallFloat.int4ToLong(SmallFloat.longToInt4(i)));
}
final int maxEncoded = SmallFloat.longToInt4(Long.MAX_VALUE);
for (int i = 1; i < maxEncoded; ++i) {
assertTrue(SmallFloat.int4ToLong(i) > SmallFloat.int4ToLong(i - 1));
}
final int iters = atLeast(1000);
for (int iter = 0; iter < iters; ++iter) {
final long l = TestUtil.nextLong(random(), 0, 1L << TestUtil.nextInt(random(), 5, 61));
int numBits = 64 - Long.numberOfLeadingZeros(l);
long expected = l;
if (numBits > 4) {
long mask = ~0L << (numBits - 4);
expected &= mask;
}
long l2 = SmallFloat.int4ToLong(SmallFloat.longToInt4(l));
assertEquals(expected, l2);
}
}
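The masking in the loop above is the whole contract: only the four most significant bits of the value survive encoding, and the mapping is monotonic. A sketch of an encoder/decoder pair that satisfies that contract, as an illustration rather than the exact SmallFloat code:
// Sketch: encode a non-negative long on 4 bits of precision, monotonically.
static int longToInt4Sketch(long i) { // assumes i >= 0
  int numBits = 64 - Long.numberOfLeadingZeros(i);
  if (numBits < 4) {
    return (int) i;                            // 0..7 stored as-is ("subnormal")
  }
  int shift = numBits - 4;
  int mantissa = (int) (i >>> shift) & 0x07;   // keep top 4 bits, drop the implicit leading bit
  return ((shift + 1) << 3) | mantissa;        // shift 0 is reserved for subnormals
}

static long int4ToLongSketch(int b) {
  int shift = (b >>> 3) - 1;
  long mantissa = b & 0x07;
  if (shift == -1) {
    return mantissa;                           // subnormal: the value itself
  }
  return (mantissa | 0x08) << shift;           // restore the implicit bit
}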
public void testByte4() {
int[] decoded = new int[256];
for (int b = 0; b < 256; ++b) {
decoded[b] = SmallFloat.byte4ToInt((byte) b);
assertEquals((byte) b, SmallFloat.intToByte4(decoded[b]));
}
for (int i = 1; i < 256; ++i) {
assertTrue(decoded[i] > decoded[i-1]);
}
assertEquals((byte) 255, SmallFloat.intToByte4(Integer.MAX_VALUE));
final int iters = atLeast(1000);
for (int iter = 0; iter < iters; ++iter) {
final int i = random().nextInt(1 << TestUtil.nextInt(random(), 5, 30));
int idx = Arrays.binarySearch(decoded, i);
if (idx < 0) {
idx = -2 - idx;
}
assertTrue(decoded[idx] <= i);
assertEquals((byte) idx, SmallFloat.intToByte4(i));
}
}
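The binarySearch step pins down the rounding direction: for any int i, intToByte4 picks the largest encodable value that does not exceed i (decoded[idx] <= i), i.e. the encoder rounds down rather than to nearest.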
@ -146,5 +185,4 @@ public class TestSmallFloat extends LuceneTestCase {
}
}
***/
}

View File

@ -44,7 +44,7 @@ public class TestExpressionRescorer extends LuceneTestCase {
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setSimilarity(new ClassicSimilarity()));
Document doc = new Document();
doc.add(newStringField("id", "1", Field.Store.YES));

View File

@ -72,6 +72,8 @@ import org.apache.lucene.search.PhraseQuery.Builder;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
@ -147,7 +149,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
CustomScoreQuery query = new CustomScoreQuery(termQuery);
searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
@ -199,7 +201,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
query.add(new Term(FIELD_NAME, "very"));
searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
@ -271,7 +273,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
};
searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);

View File

@ -892,7 +892,7 @@ public class MemoryIndex {
NumericDocValues getNormDocValues() {
if (norm == null) {
FieldInvertState invertState = new FieldInvertState(fieldInfo.name, fieldInfo.number,
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number,
numTokens, numOverlapTokens, 0);
final long value = normSimilarity.computeNorm(invertState);
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);

View File

@ -50,6 +50,7 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
@ -57,13 +58,16 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@ -145,32 +149,32 @@ public class TestMemoryIndex extends LuceneTestCase {
assertEquals(reader.getTermVectors(0).size(), 1);
}
public void testReaderConsistency() throws IOException {
Analyzer analyzer = new MockPayloadAnalyzer();
// defaults
MemoryIndex mi = new MemoryIndex();
mi.addField("field", "some terms be here", analyzer);
TestUtil.checkReader(mi.createSearcher().getIndexReader());
// all combinations of offsets/payloads options
mi = new MemoryIndex(true, true);
mi.addField("field", "some terms be here", analyzer);
TestUtil.checkReader(mi.createSearcher().getIndexReader());
mi = new MemoryIndex(true, false);
mi.addField("field", "some terms be here", analyzer);
TestUtil.checkReader(mi.createSearcher().getIndexReader());
mi = new MemoryIndex(false, true);
mi.addField("field", "some terms be here", analyzer);
TestUtil.checkReader(mi.createSearcher().getIndexReader());
mi = new MemoryIndex(false, false);
mi.addField("field", "some terms be here", analyzer);
TestUtil.checkReader(mi.createSearcher().getIndexReader());
analyzer.close();
}
@ -187,11 +191,23 @@ public class TestMemoryIndex extends LuceneTestCase {
float n1 = norms.longValue();
// Norms are re-computed when we change the Similarity
mi.setSimilarity(new ClassicSimilarity() {
mi.setSimilarity(new Similarity() {
@Override
public float lengthNorm(FieldInvertState state) {
public long computeNorm(FieldInvertState state) {
return 74;
}
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
throw new UnsupportedOperationException();
}
@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
throw new UnsupportedOperationException();
}
});
norms = reader.getNormValues("f1");
assertEquals(0, norms.nextDoc());

View File

@ -17,7 +17,6 @@
package org.apache.lucene.misc;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.index.FieldInvertState;
/**
* <p>
@ -86,7 +85,7 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
* Sets the default function variables used by lengthNorm when no field
* specific variables have been set.
*
* @see #computeLengthNorm
* @see #lengthNorm
*/
public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) {
this.ln_min = min;
@ -94,25 +93,6 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
this.ln_steep = steepness;
this.discountOverlaps = discountOverlaps;
}
/**
* Implemented as <code> state.getBoost() *
* computeLengthNorm(numTokens) </code> where
* numTokens does not count overlap tokens when
* discountOverlaps is enabled, either by default or for this
* specific field.
*/
@Override
public float lengthNorm(FieldInvertState state) {
final int numTokens;
if (discountOverlaps)
numTokens = state.getLength() - state.getNumOverlap();
else
numTokens = state.getLength();
return computeLengthNorm(numTokens);
}
/**
* Implemented as:
@ -133,7 +113,8 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
* @see #setLengthNormFactors
* @see <a href="doc-files/ss.computeLengthNorm.svg">An SVG visualization of this function</a>
*/
public float computeLengthNorm(int numTerms) {
@Override
public float lengthNorm(int numTerms) {
final int l = ln_min;
final int h = ln_max;
final float s = ln_steep;
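For reference, the curve this method computes (only renamed here, from computeLengthNorm to lengthNorm) is a hyperbolic sweet spot. A sketch, assuming the three factors configured above; it is flat at 1.0 inside the [min, max] plateau, then decays with the configured steepness, matching the plateau assertions in the tests:
// Sketch of the sweet-spot length norm; not the exact method body.
float lengthNormSketch(int numTerms, int min, int max, float steepness) {
  return (float) (1.0 / Math.sqrt(
      steepness * (Math.abs(numTerms - min) + Math.abs(numTerms - max) - (max - min)) + 1.0));
}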

View File

@ -16,27 +16,62 @@
*/
package org.apache.lucene.misc;
import java.io.IOException;
import java.util.Collections;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState;
/**
* Test of the SweetSpotSimilarity
*/
public class SweetSpotSimilarityTest extends LuceneTestCase {
public static float computeAndDecodeNorm(SweetSpotSimilarity decode, Similarity encode, FieldInvertState state) {
return decode.decodeNormValue(computeAndGetNorm(encode, state));
}
public static byte computeAndGetNorm(Similarity s, FieldInvertState state) {
return (byte) s.computeNorm(state);
private static float computeNorm(Similarity sim, String field, int length) throws IOException {
String value = IntStream.range(0, length).mapToObj(i -> "a").collect(Collectors.joining(" "));
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
w.addDocument(Collections.singleton(newTextField(field, value, Store.NO)));
DirectoryReader reader = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(sim);
Explanation expl = searcher.explain(new TermQuery(new Term(field, "a")), 0);
reader.close();
dir.close();
Explanation norm = findExplanation(expl, "fieldNorm");
assertNotNull(norm);
return norm.getValue();
}
public void testSweetSpotComputeNorm() {
private static Explanation findExplanation(Explanation expl, String text) {
if (expl.getDescription().startsWith(text)) {
return expl;
} else {
for (Explanation sub : expl.getDetails()) {
Explanation match = findExplanation(sub, text);
if (match != null) {
return match;
}
}
}
return null;
}
public void testSweetSpotComputeNorm() throws IOException {
final SweetSpotSimilarity ss = new SweetSpotSimilarity();
ss.setLengthNormFactors(1,1,0.5f,true);
@ -46,12 +81,10 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
// base case, should degrade
FieldInvertState invertState = new FieldInvertState("bogus");
for (int i = 1; i < 1000; i++) {
invertState.setLength(i);
assertEquals("base case: i="+i,
computeAndGetNorm(d, invertState),
computeAndGetNorm(s, invertState),
computeNorm(d, "bogus", i),
computeNorm(s, "bogus", i),
0.0f);
}
@ -60,22 +93,19 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
ss.setLengthNormFactors(3,10,0.5f,true);
for (int i = 3; i <=10; i++) {
invertState.setLength(i);
assertEquals("3,10: spot i="+i,
1.0f,
computeAndDecodeNorm(ss, ss, invertState),
computeNorm(ss, "bogus", i),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = computeAndGetNorm(s, invertState);
final float normD = computeNorm(d, "bogus", i - 9);
final float normS = computeNorm(s, "bogus", i);
assertEquals("3,10: 10<x : i="+i,
normD,
normS,
0.0f);
0.01f);
}
@ -106,78 +136,60 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
}
};
invertState = new FieldInvertState("foo");
for (int i = 3; i <=10; i++) {
invertState.setLength(i);
assertEquals("f: 3,10: spot i="+i,
1.0f,
computeAndDecodeNorm(ss, sp, invertState),
computeNorm(sp, "foo", i),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = computeAndGetNorm(sp, invertState);
final float normD = computeNorm(d, "foo", i-9);
final float normS = computeNorm(sp, "foo", i);
assertEquals("f: 3,10: 10<x : i="+i,
normD,
normS,
0.0f);
0.01f);
}
invertState = new FieldInvertState("bar");
for (int i = 8; i <=13; i++) {
invertState.setLength(i);
assertEquals("f: 8,13: spot i="+i,
1.0f,
computeAndDecodeNorm(ss, sp, invertState),
0.0f);
computeNorm(sp, "bar", i),
0.01f);
}
invertState = new FieldInvertState("yak");
for (int i = 6; i <=9; i++) {
invertState.setLength(i);
assertEquals("f: 6,9: spot i="+i,
1.0f,
computeAndDecodeNorm(ss, sp, invertState),
0.0f);
computeNorm(sp, "yak", i),
0.01f);
}
invertState = new FieldInvertState("bar");
for (int i = 13; i < 1000; i++) {
invertState.setLength(i-12);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = computeAndGetNorm(sp, invertState);
final float normD = computeNorm(d, "bar", i-12);
final float normS = computeNorm(sp, "bar", i);
assertEquals("f: 8,13: 13<x : i="+i,
normD,
normS,
0.0f);
0.01f);
}
invertState = new FieldInvertState("yak");
for (int i = 9; i < 1000; i++) {
invertState.setLength(i-8);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = computeAndGetNorm(sp, invertState);
final float normD = computeNorm(d, "yak", i-8);
final float normS = computeNorm(sp, "yak", i);
assertEquals("f: 6,9: 9<x : i="+i,
normD,
normS,
0.0f);
0.01f);
}
// steepness
for (int i = 9; i < 1000; i++) {
invertState = new FieldInvertState("a");
invertState.setLength(i);
final byte normSS = computeAndGetNorm(sp, invertState);
invertState = new FieldInvertState("b");
invertState.setLength(i);
final byte normS = computeAndGetNorm(sp, invertState);
final float normSS = computeNorm(sp, "a", i);
final float normS = computeNorm(sp, "b", i);
assertTrue("s: i="+i+" : a="+normSS+
" < b="+normS,
normSS < normS);

View File

@ -20,19 +20,24 @@ import java.io.IOException;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.docvalues.FloatDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity.SimWeight;
/**
* Function that returns {@link TFIDFSimilarity#decodeNormValue(long)}
* for every document.
* Function that returns the decoded norm for every document.
* <p>
* Note that the configured Similarity for the field must be
* a subclass of {@link TFIDFSimilarity}
* a subclass of {@link TFIDFSimilarity} and the contribution of
* the TF needs to be 1 when the freq is 1 and the contribution
* of the IDF needs to be 1 when docFreq == docCount == 1.
* @lucene.internal */
public class NormValueSource extends ValueSource {
protected final String field;
@ -61,11 +66,12 @@ public class NormValueSource extends ValueSource {
if (similarity == null) {
throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
}
final NumericDocValues norms = readerContext.reader().getNormValues(field);
if (norms == null) {
return new ConstDoubleDocValues(0.0, this);
}
// Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf
// is 1 when docCount == docFreq == 1
final SimWeight simWeight = similarity.computeWeight(1f,
new CollectionStatistics(field, 1, 1, 1, 1),
new TermStatistics(new BytesRef("bogus"), 1, 1));
final SimScorer simScorer = similarity.simScorer(simWeight, readerContext);
return new FloatDocValues(this) {
int lastDocID = -1;
@ -74,16 +80,8 @@ public class NormValueSource extends ValueSource {
if (docID < lastDocID) {
throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID);
}
if (docID > norms.docID()) {
norms.advance(docID);
}
long norm;
if (docID == norms.docID()) {
norm = norms.longValue();
} else {
norm = 0;
}
return similarity.decodeNormValue(norm);
lastDocID = docID;
return simScorer.score(docID, 1f);
}
};
}
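Why the degenerate statistics recover the raw norm: for a TFIDF scorer, score(doc, freq=1) is tf(1) * idf(docFreq, docCount) * norm(doc). With ClassicSimilarity, tf(1) = sqrt(1) = 1 and, assuming its idf of log((docCount+1)/(docFreq+1)) + 1, idf(1, 1) = log(1) + 1 = 1, so the score collapses to the decoded norm, which is exactly the constraint spelled out in the comment above.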

View File

@ -21,7 +21,6 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
@ -33,10 +32,9 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -48,7 +46,7 @@ public class TestLongNormValueSource extends LuceneTestCase {
static IndexSearcher searcher;
static Analyzer analyzer;
private static Similarity sim = new PreciseClassicSimilarity();
private static Similarity sim = new ClassicSimilarity();
@BeforeClass
public static void beforeClass() throws Exception {
@ -116,114 +114,3 @@ public class TestLongNormValueSource extends LuceneTestCase {
CheckHits.checkExplanations(q, "", searcher);
}
}
/** Encodes norm as 4-byte float. */
class PreciseClassicSimilarity extends TFIDFSimilarity {
/** Sole constructor: parameter-free */
public PreciseClassicSimilarity() {}
/**
* Encodes a normalization factor for storage in an index.
* <p>
* Unlike the single-byte encoding used by ClassicSimilarity, this test
* implementation stores the full 32-bit float representation, so no
* precision is lost.
*/
@Override
public final long encodeNormValue(float f) {
return Float.floatToIntBits(f);
}
/**
* Decodes the norm value from its full 32-bit float representation.
*
* @see #encodeNormValue(float)
*/
@Override
public final float decodeNormValue(long norm) {
return Float.intBitsToFloat((int)norm);
}
/** Implemented as
* <code>1/sqrt(numTerms)</code>, where
* <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
* #setDiscountOverlaps} is false, else it's {@link
* org.apache.lucene.index.FieldInvertState#getLength()} - {@link
* org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
*
* @lucene.experimental */
@Override
public float lengthNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return (float) (1.0 / Math.sqrt(numTerms));
}
/** Implemented as <code>sqrt(freq)</code>. */
@Override
public float tf(float freq) {
return (float)Math.sqrt(freq);
}
/** Implemented as <code>1 / (distance + 1)</code>. */
@Override
public float sloppyFreq(int distance) {
return 1.0f / (distance + 1);
}
/** The default implementation returns <code>1</code> */
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1;
}
/** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
@Override
public float idf(long docFreq, long docCount) {
return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true;
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
@Override
public String toString() {
return "DefaultSimilarity";
}
}

View File

@ -367,7 +367,7 @@ public class TestValueSources extends LuceneTestCase {
// no norm field (so agnostic to indexed similarity)
searcher.setSimilarity(new ClassicSimilarity());
ValueSource vs = new NormValueSource("byte");
assertHits(new FunctionQuery(vs), new float[] { 0f, 0f });
assertHits(new FunctionQuery(vs), new float[] { 1f, 1f });
// regardless of whether norms exist, value source exists == 0
assertAllExist(vs);

View File

@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
@ -143,9 +142,9 @@ public class TestPayloadScoreQuery extends LuceneTestCase {
// check includeSpanScore makes a difference here
searcher.setSimilarity(new MultiplyingSimilarity());
try {
checkQuery(q, new MaxPayloadFunction(), new int[]{ 122, 222 }, new float[]{ 41.802513122558594f, 34.13160705566406f });
checkQuery(q, new MinPayloadFunction(), new int[]{ 222, 122 }, new float[]{ 34.13160705566406f, 20.901256561279297f });
checkQuery(q, new AveragePayloadFunction(), new int[] { 122, 222 }, new float[]{ 38.3189697265625f, 34.13160705566406f });
checkQuery(q, new MaxPayloadFunction(), new int[]{ 122, 222 }, new float[]{ 20.901256561279297f, 17.06580352783203f });
checkQuery(q, new MinPayloadFunction(), new int[]{ 222, 122 }, new float[]{ 17.06580352783203f, 10.450628280639648f });
checkQuery(q, new AveragePayloadFunction(), new int[] { 122, 222 }, new float[]{ 19.15948486328125f, 17.06580352783203f });
checkQuery(q, new MaxPayloadFunction(), false, new int[]{122, 222}, new float[]{4.0f, 4.0f});
checkQuery(q, new MinPayloadFunction(), false, new int[]{222, 122}, new float[]{4.0f, 2.0f});
checkQuery(q, new AveragePayloadFunction(), false, new int[]{222, 122}, new float[]{4.0f, 3.666666f});
@ -298,7 +297,7 @@ public class TestPayloadScoreQuery extends LuceneTestCase {
//Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override
public float lengthNorm(FieldInvertState state) {
public float lengthNorm(int length) {
return 1;
}

View File

@ -25,7 +25,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@ -268,7 +267,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
//Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override
public float lengthNorm(FieldInvertState state) {
public float lengthNorm(int length) {
return 1;
}

View File

@ -31,7 +31,7 @@ import java.util.Random;
* for the same field.
*/
public class RandomSimilarity extends PerFieldSimilarityWrapper {
final ClassicSimilarity defaultSim = new ClassicSimilarity();
final BM25Similarity defaultSim = new BM25Similarity();
final List<Similarity> knownSims;
Map<String,Similarity> previousMappings = new HashMap<>();
final int perFieldSeed;

View File

@ -86,8 +86,8 @@ public class DisMaxRequestHandlerTest extends SolrTestCaseJ4 {
req("cool stuff")
,"//*[@numFound='3']"
,"//result/doc[1]/int[@name='id'][.='42']"
,"//result/doc[2]/int[@name='id'][.='8675309']"
,"//result/doc[3]/int[@name='id'][.='666']"
,"//result/doc[2]/int[@name='id'][.='666']"
,"//result/doc[3]/int[@name='id'][.='8675309']"
);
assertQ("multi qf",

View File

@ -97,8 +97,8 @@ public class QueryElevationComponentTest extends SolrTestCaseJ4 {
CommonParams.FL, "id, score, [elevated]")
, "//*[@numFound='3']"
, "//result/doc[1]/float[@name='id'][.='7.0']"
, "//result/doc[2]/float[@name='id'][.='8.0']"
, "//result/doc[3]/float[@name='id'][.='9.0']",
, "//result/doc[2]/float[@name='id'][.='9.0']"
, "//result/doc[3]/float[@name='id'][.='8.0']",
"//result/doc[1]/bool[@name='[elevated]'][.='true']",
"//result/doc[2]/bool[@name='[elevated]'][.='false']",
"//result/doc[3]/bool[@name='[elevated]'][.='false']"

View File

@ -49,6 +49,6 @@ public class TestPayloadScoreQParserPlugin extends SolrTestCaseJ4 {
// TODO: fix this includeSpanScore test to be less brittle - score result is score of "A" (via BM25) multiplied by 1.0 (payload value)
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min}"), "//float[@name='score']='1.0'");
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.25811607'");
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.2876821'");
}
}

View File

@ -65,9 +65,9 @@ public class SortByFunctionTest extends AbstractSolrTestCase {
assertQ(req("fl", "id,score", "q", "f_t:ipod", "sort", "score desc"),
"//*[@numFound='4']",
"//result/doc[1]/int[@name='id'][.='1']",
"//result/doc[2]/int[@name='id'][.='4']",
"//result/doc[3]/int[@name='id'][.='2']",
"//result/doc[4]/int[@name='id'][.='3']"
"//result/doc[2]/int[@name='id'][.='2']",
"//result/doc[3]/int[@name='id'][.='3']",
"//result/doc[4]/int[@name='id'][.='4']"
);

View File

@ -25,7 +25,6 @@ import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.solr.SolrTestCaseJ4;
@ -431,12 +430,8 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
assertQ(req("fl","*,score","q", "{!func}tf(a_tfidf,cow)", "fq","id:6"),
"//float[@name='score']='" + similarity.tf(5) + "'");
FieldInvertState state = new FieldInvertState("a_tfidf");
state.setLength(4);
long norm = similarity.computeNorm(state);
float nrm = similarity.decodeNormValue((byte) norm);
assertQ(req("fl","*,score","q", "{!func}norm(a_tfidf)", "fq","id:2"),
"//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
"//float[@name='score']='0.5'"); // 1/sqrt(4)==1/2==0.5
}

View File

@ -16,8 +16,22 @@
*/
package org.apache.solr.search.similarities;
import java.io.IOException;
import java.util.Collections;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.junit.BeforeClass;
/**
@ -28,7 +42,38 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
public static void beforeClass() throws Exception {
initCore("solrconfig-basic.xml","schema-sweetspot.xml");
}
private static float computeNorm(Similarity sim, int length) throws IOException {
String value = IntStream.range(0, length).mapToObj(i -> "a").collect(Collectors.joining(" "));
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
w.addDocument(Collections.singleton(newTextField("foo", value, Store.NO)));
DirectoryReader reader = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(sim);
Explanation expl = searcher.explain(new TermQuery(new Term("foo", "a")), 0);
reader.close();
dir.close();
Explanation norm = findExplanation(expl, "fieldNorm");
assertNotNull(norm);
return norm.getValue();
}
private static Explanation findExplanation(Explanation expl, String text) {
if (expl.getDescription().startsWith(text)) {
return expl;
} else {
for (Explanation sub : expl.getDetails()) {
Explanation match = findExplanation(sub, text);
if (match != null) {
return match;
}
}
}
return null;
}
/** default parameters */
public void testDefaults() throws Exception {
SweetSpotSimilarity sim = getSimilarity("text", SweetSpotSimilarity.class);
@ -40,9 +85,9 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
}
// default norm sanity check
assertEquals("norm 1", 1.00F, sim.computeLengthNorm(1), 0.0F);
assertEquals("norm 4", 0.50F, sim.computeLengthNorm(4), 0.0F);
assertEquals("norm 16", 0.25F, sim.computeLengthNorm(16), 0.0F);
assertEquals("norm 1", 1.00F, computeNorm(sim, 1), 0.0F);
assertEquals("norm 4", 0.50F, computeNorm(sim, 4), 0.0F);
assertEquals("norm 16", 0.25F, computeNorm(sim, 16), 0.0F);
}
/** baseline with parameters */
@ -65,17 +110,17 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
// norms: plateau from 3-5
assertEquals("norm 1 == 7",
sim.computeLengthNorm(1), sim.computeLengthNorm(7), 0.0F);
computeNorm(sim, 1), computeNorm(sim, 7), 0.0F);
assertEquals("norm 2 == 6",
sim.computeLengthNorm(1), sim.computeLengthNorm(7), 0.0F);
assertEquals("norm 3", 1.00F, sim.computeLengthNorm(3), 0.0F);
assertEquals("norm 4", 1.00F, sim.computeLengthNorm(4), 0.0F);
assertEquals("norm 5", 1.00F, sim.computeLengthNorm(5), 0.0F);
assertTrue("norm 6 too high: " + sim.computeLengthNorm(6),
sim.computeLengthNorm(6) < 1.0F);
computeNorm(sim, 2), computeNorm(sim, 6), 0.0F);
assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
assertTrue("norm 6 too high: " + computeNorm(sim, 6),
computeNorm(sim, 6) < 1.0F);
assertTrue("norm 7 higher then norm 6",
sim.computeLengthNorm(7) < sim.computeLengthNorm(6));
assertEquals("norm 20", 0.25F, sim.computeLengthNorm(20), 0.0F);
computeNorm(sim, 7) < computeNorm(sim, 6));
assertEquals("norm 20", 0.25F, computeNorm(sim, 20), 0.0F);
}
/** hyperbolic with parameters */
@ -92,16 +137,16 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
assertEquals("MID tf", 3.3F+(7.7F - 3.3F)/2.0F, sim.tf(5), 0.00001F);
// norms: plateau from 1-5, shallow slope
assertEquals("norm 1", 1.00F, sim.computeLengthNorm(1), 0.0F);
assertEquals("norm 2", 1.00F, sim.computeLengthNorm(2), 0.0F);
assertEquals("norm 3", 1.00F, sim.computeLengthNorm(3), 0.0F);
assertEquals("norm 4", 1.00F, sim.computeLengthNorm(4), 0.0F);
assertEquals("norm 5", 1.00F, sim.computeLengthNorm(5), 0.0F);
assertTrue("norm 6 too high: " + sim.computeLengthNorm(6),
sim.computeLengthNorm(6) < 1.0F);
assertEquals("norm 1", 1.00F, computeNorm(sim, 1), 0.0F);
assertEquals("norm 2", 1.00F, computeNorm(sim, 2), 0.0F);
assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
assertTrue("norm 6 too high: " + computeNorm(sim, 6),
computeNorm(sim, 6) < 1.0F);
assertTrue("norm 7 higher then norm 6",
sim.computeLengthNorm(7) < sim.computeLengthNorm(6));
assertTrue("norm 20 not high enough: " + sim.computeLengthNorm(20),
0.25F < sim.computeLengthNorm(20));
computeNorm(sim, 7) < computeNorm(sim, 6));
assertTrue("norm 20 not high enough: " + computeNorm(sim, 20),
0.25F < computeNorm(sim, 20));
}
}