mirror of https://github.com/apache/lucene.git
LUCENE-7730: Better accuracy for the length normalization factor.
commit 06a6034d9b (parent c53d19e7b2)
@@ -63,6 +63,9 @@ Improvements
 * LUCENE-7489: Better storage of sparse doc-values fields with the default
   codec. (Adrien Grand)
 
+* LUCENE-7730: More accurate encoding of the length normalization factor
+  thanks to the removal of index-time boosts. (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
File diff suppressed because it is too large
@@ -603,7 +603,7 @@ final class DefaultIndexingChain extends DocConsumer {
       // PerField.invert to allow for later downgrading of the index options:
       fi.setIndexOptions(fieldType.indexOptions());
 
-      fp = new PerField(fi, invert);
+      fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
       fp.next = fieldHash[hashPos];
       fieldHash[hashPos] = fp;
       totalFieldCount++;
@@ -633,6 +633,7 @@ final class DefaultIndexingChain extends DocConsumer {
   /** NOTE: not static: accesses at least docState, termsHash. */
   private final class PerField implements Comparable<PerField> {
 
+    final int indexCreatedVersionMajor;
     final FieldInfo fieldInfo;
     final Similarity similarity;
 
@@ -659,7 +660,8 @@ final class DefaultIndexingChain extends DocConsumer {
     // reused
     TokenStream tokenStream;
 
-    public PerField(FieldInfo fieldInfo, boolean invert) {
+    public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
+      this.indexCreatedVersionMajor = indexCreatedVersionMajor;
       this.fieldInfo = fieldInfo;
       similarity = docState.similarity;
       if (invert) {
@@ -668,7 +670,7 @@ final class DefaultIndexingChain extends DocConsumer {
     }
 
     void setInvertState() {
-      invertState = new FieldInvertState(fieldInfo.name);
+      invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
       termsHashPerField = termsHash.addField(invertState, fieldInfo);
       if (fieldInfo.omitsNorms() == false) {
         assert norms == null;
@@ -193,6 +193,10 @@ class DocumentsWriterPerThread {
     return fieldInfos;
   }
 
+  public int getIndexCreatedVersionMajor() {
+    return indexWriter.segmentInfos.getIndexCreatedVersionMajor();
+  }
+
   final void testPoint(String message) {
     if (enableTestPoints) {
       assert infoStream.isEnabled("TP"); // don't enable unless you need them.
@@ -31,7 +31,8 @@ import org.apache.lucene.util.AttributeSource;
  * @lucene.experimental
  */
 public final class FieldInvertState {
-  String name;
+  final int indexCreatedVersionMajor;
+  final String name;
   int position;
   int length;
   int numOverlap;
@@ -50,14 +51,15 @@ public final class FieldInvertState {
 
   /** Creates {@code FieldInvertState} for the specified
    *  field name. */
-  public FieldInvertState(String name) {
+  public FieldInvertState(int indexCreatedVersionMajor, String name) {
+    this.indexCreatedVersionMajor = indexCreatedVersionMajor;
     this.name = name;
   }
 
   /** Creates {@code FieldInvertState} for the specified
    *  field name and values for all fields. */
-  public FieldInvertState(String name, int position, int length, int numOverlap, int offset) {
-    this.name = name;
+  public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
+    this(indexCreatedVersionMajor, name);
     this.position = position;
     this.length = length;
     this.numOverlap = numOverlap;
@@ -164,4 +166,11 @@ public final class FieldInvertState {
   public String getName() {
     return name;
   }
+
+  /**
+   * Return the version that was used to create the index, or 6 if it was created before 7.0.
+   */
+  public int getIndexCreatedVersionMajor() {
+    return indexCreatedVersionMajor;
+  }
 }
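Illustration (not part of the commit): the new accessor is what lets norm encoding vary per index generation. A minimal sketch of the pattern that the Similarity changes below all follow, assuming only the 7.x APIs introduced in this diff:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.util.SmallFloat;

    // Sketch: pick the norm encoding based on the index's major creation version.
    static long encodeNorm(FieldInvertState state, boolean discountOverlaps) {
      int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
      if (state.getIndexCreatedVersionMajor() >= 7) {
        return SmallFloat.intToByte4(numTerms); // 7.0+ indexes store the length itself
      } else {
        return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms))); // legacy lossy encoding
      }
    }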
@@ -96,20 +96,6 @@ public class BM25Similarity extends Similarity {
     }
   }
 
-  /** The default implementation encodes <code>1 / sqrt(length)</code>
-   * with {@link SmallFloat#floatToByte315(float)}. This is compatible with
-   * Lucene's historic implementation: {@link ClassicSimilarity}. If you
-   * change this, then you should change {@link #decodeNormValue(byte)} to match. */
-  protected byte encodeNormValue(int fieldLength) {
-    return SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
-  }
-
-  /** The default implementation returns <code>1 / f<sup>2</sup></code>
-   * where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
-  protected float decodeNormValue(byte b) {
-    return NORM_TABLE[b & 0xFF];
-  }
-
   /**
    * True if overlap tokens (tokens with a position of increment of zero) are
    * discounted from the document's length.
@@ -132,21 +118,31 @@ public class BM25Similarity extends Similarity {
   }
 
   /** Cache of decoded bytes. */
-  private static final float[] NORM_TABLE = new float[256];
+  private static final float[] OLD_LENGTH_TABLE = new float[256];
+  private static final float[] LENGTH_TABLE = new float[256];
 
   static {
     for (int i = 1; i < 256; i++) {
       float f = SmallFloat.byte315ToFloat((byte)i);
-      NORM_TABLE[i] = 1.0f / (f*f);
+      OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
     }
-    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
+    OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
+
+    for (int i = 0; i < 256; i++) {
+      LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
+    }
   }
 
 
   @Override
   public final long computeNorm(FieldInvertState state) {
     final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
-    return encodeNormValue(numTerms);
+    int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
+    if (indexCreatedVersionMajor >= 7) {
+      return SmallFloat.intToByte4(numTerms);
+    } else {
+      return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
+    }
   }
 
   /**
@@ -207,34 +203,43 @@ public class BM25Similarity extends Similarity {
   @Override
   public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
     Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
 
     float avgdl = avgFieldLength(collectionStats);
 
     // compute freq-independent part of bm25 equation across all norm values
-    float cache[] = new float[256];
+    float[] oldCache = new float[256];
+    float[] cache = new float[256];
     for (int i = 0; i < cache.length; i++) {
-      cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
+      oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
+      cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
     }
-    return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
+    return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
   }
 
   @Override
   public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
     BM25Stats bm25stats = (BM25Stats) stats;
-    return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
+    return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
   }
 
   private class BM25DocScorer extends SimScorer {
     private final BM25Stats stats;
     private final float weightValue; // boost * idf * (k1 + 1)
     private final NumericDocValues norms;
+    /** precomputed cache for all length values */
+    private final float[] lengthCache;
+    /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
     private final float[] cache;
 
-    BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
+    BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
       this.stats = stats;
       this.weightValue = stats.weight * (k1 + 1);
-      this.cache = stats.cache;
       this.norms = norms;
+      if (indexCreatedVersionMajor >= 7) {
+        lengthCache = LENGTH_TABLE;
+        cache = stats.cache;
+      } else {
+        lengthCache = OLD_LENGTH_TABLE;
+        cache = stats.oldCache;
+      }
     }
 
     @Override
@@ -245,7 +250,7 @@ public class BM25Similarity extends Similarity {
         norm = k1;
       } else {
         if (norms.advanceExact(doc)) {
-          norm = cache[(byte)norms.longValue() & 0xFF];
+          norm = cache[((byte) norms.longValue()) & 0xFF];
         } else {
           norm = cache[0];
         }
@@ -255,7 +260,7 @@ public class BM25Similarity extends Similarity {
 
     @Override
     public Explanation explain(int doc, Explanation freq) throws IOException {
-      return explainScore(doc, freq, stats, norms);
+      return explainScore(doc, freq, stats, norms, lengthCache);
     }
 
     @Override
@@ -281,21 +286,23 @@ public class BM25Similarity extends Similarity {
     private final float weight;
     /** field name, for pulling norms */
     private final String field;
-    /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
-    private final float cache[];
+    /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
+     *  for both OLD_LENGTH_TABLE and LENGTH_TABLE */
+    private final float[] oldCache, cache;
 
-    BM25Stats(String field, float boost, Explanation idf, float avgdl, float cache[]) {
+    BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
       this.field = field;
       this.boost = boost;
       this.idf = idf;
       this.avgdl = avgdl;
-      this.cache = cache;
       this.weight = idf.getValue() * boost;
+      this.oldCache = oldCache;
+      this.cache = cache;
     }
 
   }
 
-  private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
+  private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
     List<Explanation> subs = new ArrayList<>();
     subs.add(freq);
     subs.add(Explanation.match(k1, "parameter k1"));
@@ -311,7 +318,7 @@ public class BM25Similarity extends Similarity {
       } else {
         norm = 0;
       }
-      float doclen = decodeNormValue(norm);
+      float doclen = lengthCache[norm & 0xff];
       subs.add(Explanation.match(b, "parameter b"));
       subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
       subs.add(Explanation.match(doclen, "fieldLength"));
@@ -321,13 +328,13 @@ public class BM25Similarity extends Similarity {
     }
   }
 
-  private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
+  private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
     Explanation boostExpl = Explanation.match(stats.boost, "boost");
     List<Explanation> subs = new ArrayList<>();
     if (boostExpl.getValue() != 1.0f)
       subs.add(boostExpl);
     subs.add(stats.idf);
-    Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms);
+    Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
     subs.add(tfNormExpl);
     return Explanation.match(
       boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
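Illustration (not part of the commit): the practical difference between the two encodings used above. The SmallFloat calls are the real ones from this change; the class name and values are made up for the demo.

    import org.apache.lucene.util.SmallFloat;

    // Round-trip a field length through both norm encodings.
    public class NormEncodingDemo {
      public static void main(String[] args) {
        int fieldLength = 42;
        // pre-7.0: the norm stores a lossy 1/sqrt(length) float
        byte oldNorm = SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
        float oldDecoded = SmallFloat.byte315ToFloat(oldNorm);
        // 7.0+: the norm stores the (compressed) length itself
        byte newNorm = SmallFloat.intToByte4(fieldLength);
        int newDecoded = SmallFloat.byte4ToInt(newNorm);
        System.out.println("old decode: " + oldDecoded + ", new decode: " + newDecoded);
      }
    }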
@@ -17,91 +17,27 @@
 package org.apache.lucene.search.similarities;
 
 
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;
 
 /**
- * Expert: Default scoring implementation which {@link #encodeNormValue(float)
- * encodes} norm values as a single byte before being stored. At search time,
- * the norm byte value is read from the index
- * {@link org.apache.lucene.store.Directory directory} and
- * {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
- * This encoding/decoding, while reducing index size, comes with the price of
- * precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
- * instance, <i>decode(encode(0.89)) = 0.875</i>.
- * <p>
- * Compression of norm values to a single byte saves memory at search time,
- * because once a field is referenced at search time, its norms - for all
- * documents - are maintained in memory.
- * <p>
- * The rationale supporting such lossy compression of norm values is that given
- * the difficulty (and inaccuracy) of users to express their true information
- * need by a query, only big differences matter. <br>
- * <br>
- * Last, note that search time is too late to modify this <i>norm</i> part of
- * scoring, e.g. by using a different {@link Similarity} for search.
+ * Expert: Historical scoring implementation. You might want to consider using
+ * {@link BM25Similarity} instead, which is generally considered superior to
+ * TF-IDF.
  */
 public class ClassicSimilarity extends TFIDFSimilarity {
 
-  /** Cache of decoded bytes. */
-  private static final float[] NORM_TABLE = new float[256];
-
-  static {
-    for (int i = 0; i < 256; i++) {
-      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
-    }
-  }
-
   /** Sole constructor: parameter-free */
   public ClassicSimilarity() {}
 
-  /**
-   * Encodes a normalization factor for storage in an index.
-   * <p>
-   * The encoding uses a three-bit mantissa, a five-bit exponent, and the
-   * zero-exponent point at 15, thus representing values from around 7x10^9 to
-   * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
-   * represented. Negative numbers are rounded up to zero. Values too large to
-   * represent are rounded down to the largest representable value. Positive
-   * values too small to represent are rounded up to the smallest positive
-   * representable value.
-   *
-   * @see org.apache.lucene.util.SmallFloat
-   */
-  @Override
-  public final long encodeNormValue(float f) {
-    return SmallFloat.floatToByte315(f);
-  }
-
-  /**
-   * Decodes the norm value, assuming it is a single byte.
-   *
-   * @see #encodeNormValue(float)
-   */
-  @Override
-  public final float decodeNormValue(long norm) {
-    return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
-  }
-
   /** Implemented as
-   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
-   *  <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
-   *  #setDiscountOverlaps} is false, else it's {@link
-   *  FieldInvertState#getLength()} - {@link
-   *  FieldInvertState#getNumOverlap()}.
+   *  <code>1/sqrt(length)</code>.
    *
    *  @lucene.experimental */
   @Override
-  public float lengthNorm(FieldInvertState state) {
-    final int numTerms;
-    if (discountOverlaps)
-      numTerms = state.getLength() - state.getNumOverlap();
-    else
-      numTerms = state.getLength();
+  public float lengthNorm(int numTerms) {
     return (float) (1.0 / Math.sqrt(numTerms));
   }
 
@@ -138,33 +74,6 @@ public class ClassicSimilarity extends TFIDFSimilarity {
   public float idf(long docFreq, long docCount) {
     return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
   }
-
-  /**
-   * True if overlap tokens (tokens with a position of increment of zero) are
-   * discounted from the document's length.
-   */
-  protected boolean discountOverlaps = true;
-
-  /** Determines whether overlap tokens (Tokens with
-   *  0 position increment) are ignored when computing
-   *  norm.  By default this is true, meaning overlap
-   *  tokens do not count when computing norms.
-   *
-   *  @lucene.experimental
-   *
-   *  @see #computeNorm
-   */
-  public void setDiscountOverlaps(boolean v) {
-    discountOverlaps = v;
-  }
-
-  /**
-   * Returns true if overlap tokens are discounted from the document's length.
-   * @see #setDiscountOverlaps
-   */
-  public boolean getDiscountOverlaps() {
-    return discountOverlaps;
-  }
 
   @Override
   public String toString() {
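Illustration (not part of the commit): with lengthNorm now taking the raw term count, a custom TF-IDF subclass no longer touches FieldInvertState. A hypothetical subclass, mirroring the pattern the test changes below use:

    import org.apache.lucene.search.similarities.ClassicSimilarity;

    // Hypothetical: disable length normalization entirely, using the new
    // lengthNorm(int) signature that replaces lengthNorm(FieldInvertState).
    public class FlatLengthSimilarity extends ClassicSimilarity {
      @Override
      public float lengthNorm(int length) {
        return 1f;
      }
    }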
@@ -190,7 +190,8 @@ public abstract class SimilarityBase extends Similarity {
   }
 
   @Override
-  public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
+  public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
+    int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
     if (stats instanceof MultiSimilarity.MultiStats) {
       // a multi term query (e.g. phrase). return the summation,
       // scoring almost as if it were boolean query
@@ -198,12 +199,12 @@ public abstract class SimilarityBase extends Similarity {
       SimScorer subScorers[] = new SimScorer[subStats.length];
       for (int i = 0; i < subScorers.length; i++) {
         BasicStats basicstats = (BasicStats) subStats[i];
-        subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
+        subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
       }
       return new MultiSimilarity.MultiSimScorer(subScorers);
     } else {
       BasicStats basicstats = (BasicStats) stats;
-      return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
+      return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
     }
   }
 
@@ -216,40 +217,38 @@ public abstract class SimilarityBase extends Similarity {
 
   // ------------------------------ Norm handling ------------------------------
 
-  /** Norm to document length map. */
-  private static final float[] NORM_TABLE = new float[256];
+  /** Cache of decoded bytes. */
+  private static final float[] OLD_LENGTH_TABLE = new float[256];
+  private static final float[] LENGTH_TABLE = new float[256];
 
   static {
     for (int i = 1; i < 256; i++) {
-      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
-      NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
+      float f = SmallFloat.byte315ToFloat((byte)i);
+      OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
     }
-    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
+    OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
+
+    for (int i = 0; i < 256; i++) {
+      LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
+    }
   }
 
-  /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
+  /** Encodes the document length in the same way as {@link BM25Similarity}. */
   @Override
-  public long computeNorm(FieldInvertState state) {
-    final float numTerms;
+  public final long computeNorm(FieldInvertState state) {
+    final int numTerms;
     if (discountOverlaps)
       numTerms = state.getLength() - state.getNumOverlap();
     else
       numTerms = state.getLength();
-    return encodeNormValue(numTerms);
+    int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
+    if (indexCreatedVersionMajor >= 7) {
+      return SmallFloat.intToByte4(numTerms);
+    } else {
+      return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
+    }
   }
 
-  /** Decodes a normalization factor (document length) stored in an index.
-   * @see #encodeNormValue(float)
-   */
-  protected float decodeNormValue(byte norm) {
-    return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
-  }
-
-  /** Encodes the length to a byte via SmallFloat. */
-  protected byte encodeNormValue(float length) {
-    return SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
-  }
-
 
   // ----------------------------- Static methods ------------------------------
 
   /** Returns the base two logarithm of {@code x}. */
@@ -266,35 +265,37 @@ public abstract class SimilarityBase extends Similarity {
    * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
    * respectively.
    */
-  private class BasicSimScorer extends SimScorer {
+  final class BasicSimScorer extends SimScorer {
     private final BasicStats stats;
     private final NumericDocValues norms;
+    private final float[] normCache;
 
-    BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
+    BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
       this.stats = stats;
       this.norms = norms;
+      this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
     }
 
-    private float getNormValue(int doc) throws IOException {
+    float getLengthValue(int doc) throws IOException {
       if (norms == null) {
         return 1F;
      }
       if (norms.advanceExact(doc)) {
-        return decodeNormValue((byte) norms.longValue());
+        return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
       } else {
-        return decodeNormValue((byte) 0);
+        return 0;
       }
     }
 
     @Override
     public float score(int doc, float freq) throws IOException {
       // We have to supply something in case norms are omitted
-      return SimilarityBase.this.score(stats, freq, getNormValue(doc));
+      return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
     }
 
     @Override
     public Explanation explain(int doc, Explanation freq) throws IOException {
-      return SimilarityBase.this.explain(stats, doc, freq, getNormValue(doc));
+      return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
     }
 
     @Override
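Illustration (not part of the commit): SimilarityBase subclasses are unaffected API-wise, but the docLen they receive is now a decoded field length in both cases, per the tables above. A hypothetical subclass showing the contract:

    import org.apache.lucene.search.similarities.BasicStats;
    import org.apache.lucene.search.similarities.SimilarityBase;

    // Hypothetical: dampen term frequency by document length. docLen is an
    // approximation of the field length for both old and new indexes.
    public class LengthScaledSimilarity extends SimilarityBase {
      @Override
      protected float score(BasicStats stats, float freq, float docLen) {
        return freq / (1 + docLen);
      }

      @Override
      public String toString() {
        return "LengthScaledSimilarity";
      }
    }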
@@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
 
 
 /**
@@ -233,11 +234,6 @@ import org.apache.lucene.util.BytesRef;
  *    And this is exactly what normalizing the query vector <i>V(q)</i>
  *    provides: comparability (to a certain extent) of two or more queries.
  *    </li>
- *
- *    <li>Applying query normalization on the scores helps to keep the
- *    scores around the unit vector, hence preventing loss of score data
- *    because of floating point precision limitations.
- *    </li>
  *    </ul>
  *    </li>
  *
@@ -379,13 +375,49 @@ import org.apache.lucene.util.BytesRef;
  * @see IndexSearcher#setSimilarity(Similarity)
  */
 public abstract class TFIDFSimilarity extends Similarity {
 
+  /** Cache of decoded bytes. */
+  static final float[] OLD_NORM_TABLE = new float[256];
+
+  static {
+    for (int i = 0; i < 256; i++) {
+      OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+    }
+  }
+
   /**
    * Sole constructor. (For invocation by subclass
    * constructors, typically implicit.)
    */
   public TFIDFSimilarity() {}
 
+  /**
+   * True if overlap tokens (tokens with a position of increment of zero) are
+   * discounted from the document's length.
+   */
+  protected boolean discountOverlaps = true;
+
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is true, meaning overlap
+   *  tokens do not count when computing norms.
+   *
+   *  @lucene.experimental
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /**
+   * Returns true if overlap tokens are discounted from the document's length.
+   * @see #setDiscountOverlaps
+   */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
   /** Computes a score factor based on a term or phrase's frequency in a
    * document.  This value is multiplied by the {@link #idf(long, long)}
    * factor for each term in the query and these products are then summed to
@@ -471,30 +503,25 @@ public abstract class TFIDFSimilarity extends Similarity {
 
   /**
    * Compute an index-time normalization value for this field instance.
-   * <p>
-   * This value will be stored in a single byte lossy representation by
-   * {@link #encodeNormValue(float)}.
    *
-   * @param state statistics of the current field (such as length, boost, etc)
-   * @return an index-time normalization value
+   * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) discounting overlaps}
+   * @return a length normalization value
    */
-  public abstract float lengthNorm(FieldInvertState state);
+  public abstract float lengthNorm(int length);
 
   @Override
   public final long computeNorm(FieldInvertState state) {
-    float normValue = lengthNorm(state);
-    return encodeNormValue(normValue);
+    final int numTerms;
+    if (discountOverlaps)
+      numTerms = state.getLength() - state.getNumOverlap();
+    else
+      numTerms = state.getLength();
+    if (state.getIndexCreatedVersionMajor() >= 7) {
+      return SmallFloat.intToByte4(numTerms);
+    } else {
+      return SmallFloat.floatToByte315(lengthNorm(numTerms));
+    }
   }
 
-  /**
-   * Decodes a normalization factor stored in an index.
-   *
-   * @see #encodeNormValue(float)
-   */
-  public abstract float decodeNormValue(long norm);
-
-  /** Encodes a normalization factor for storage in an index. */
-  public abstract long encodeNormValue(float f);
-
   /** Computes the amount of a sloppy phrase match, based on an edit distance.
    * This value is summed for each sloppy phrase match in a document to form
@@ -529,24 +556,41 @@ public abstract class TFIDFSimilarity extends Similarity {
     final Explanation idf = termStats.length == 1
     ? idfExplain(collectionStats, termStats[0])
     : idfExplain(collectionStats, termStats);
-    return new IDFStats(collectionStats.field(), boost, idf);
+    float[] normTable = new float[256];
+    for (int i = 1; i < 256; ++i) {
+      int length = SmallFloat.byte4ToInt((byte) i);
+      float norm = lengthNorm(length);
+      normTable[i] = norm;
+    }
+    normTable[0] = 1f / normTable[255];
+    return new IDFStats(collectionStats.field(), boost, idf, normTable);
   }
 
   @Override
   public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
     IDFStats idfstats = (IDFStats) stats;
-    return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
+    final float[] normTable;
+    if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
+      // the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
+      normTable = idfstats.normTable;
+    } else {
+      // the norm is directly encoded in the index
+      normTable = OLD_NORM_TABLE;
+    }
+    return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
   }
 
   private final class TFIDFSimScorer extends SimScorer {
     private final IDFStats stats;
     private final float weightValue;
     private final NumericDocValues norms;
+    private final float[] normTable;
 
-    TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
+    TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
       this.stats = stats;
       this.weightValue = stats.queryWeight;
       this.norms = norms;
+      this.normTable = normTable;
     }
 
     @Override
@@ -556,13 +600,13 @@ public abstract class TFIDFSimilarity extends Similarity {
       if (norms == null) {
         return raw;
       } else {
-        long normValue;
+        float normValue;
         if (norms.advanceExact(doc)) {
-          normValue = norms.longValue();
+          normValue = normTable[(int) (norms.longValue() & 0xFF)];
         } else {
           normValue = 0;
         }
-        return raw * decodeNormValue(normValue); // normalize for field
+        return raw * normValue;  // normalize for field
       }
     }
 
@@ -578,35 +622,39 @@ public abstract class TFIDFSimilarity extends Similarity {
 
     @Override
     public Explanation explain(int doc, Explanation freq) throws IOException {
-      return explainScore(doc, freq, stats, norms);
+      return explainScore(doc, freq, stats, norms, normTable);
     }
   }
 
   /** Collection statistics for the TF-IDF model. The only statistic of interest
    * to this model is idf. */
-  private static class IDFStats extends SimWeight {
+  static class IDFStats extends SimWeight {
     private final String field;
     /** The idf and its explanation */
     private final Explanation idf;
     private final float boost;
     private final float queryWeight;
+    final float[] normTable;
 
-    public IDFStats(String field, float boost, Explanation idf) {
+    public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
       // TODO: Validate?
       this.field = field;
       this.idf = idf;
       this.boost = boost;
       this.queryWeight = boost * idf.getValue();
+      this.normTable = normTable;
     }
   }
 
-  private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
+  private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
     Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
     float norm;
-    if (norms != null && norms.advanceExact(doc)) {
-      norm = decodeNormValue(norms.longValue());
-    } else {
+    if (norms == null) {
+      norm = 1f;
+    } else if (norms.advanceExact(doc) == false) {
       norm = 0f;
+    } else {
+      norm = normTable[(int) (norms.longValue() & 0xFF)];
     }
 
     Explanation fieldNormExpl = Explanation.match(
@@ -619,9 +667,9 @@ public abstract class TFIDFSimilarity extends Similarity {
         tfExplanation, stats.idf, fieldNormExpl);
   }
 
-  private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
+  private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
     Explanation queryExpl = Explanation.match(stats.boost, "boost");
-    Explanation fieldExpl = explainField(doc, freq, stats, norms);
+    Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
     if (stats.boost == 1f) {
       return fieldExpl;
     }
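Illustration (not part of the commit): why computeWeight above builds a 256-entry table. For 7.0+ indexes a norm byte encodes a length, so lengthNorm must be applied at search time; precomputing it for every possible byte makes scoring a single array lookup. A sketch using ClassicSimilarity's 1/sqrt(length) as the lengthNorm:

    import org.apache.lucene.util.SmallFloat;

    // Sketch of the per-query norm table built in computeWeight above.
    static float[] buildNormTable() {
      float[] normTable = new float[256];
      for (int i = 1; i < 256; ++i) {
        int length = SmallFloat.byte4ToInt((byte) i); // decoded field length
        normTable[i] = (float) (1.0 / Math.sqrt(length));
      }
      normTable[0] = 1f / normTable[255]; // length 0 would otherwise divide by zero
      return normTable;
    }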
@@ -97,31 +97,74 @@ public class SmallFloat {
     return Float.intBitsToFloat(bits);
   }
 
-
-  /** floatToByte(b, mantissaBits=5, zeroExponent=2)
-   * <br>smallest nonzero value = 0.033203125
-   * <br>largest value = 1984.0
-   * <br>epsilon = 0.03125
-   */
-  public static byte floatToByte52(float f) {
-    int bits = Float.floatToRawIntBits(f);
-    int smallfloat = bits >> (24-5);
-    if (smallfloat <= (63-2)<<5) {
-      return (bits<=0) ? (byte)0 : (byte)1;
-    }
-    if (smallfloat >= ((63-2)<<5) + 0x100) {
-      return -1;
-    }
-    return (byte)(smallfloat - ((63-2)<<5));
-  }
-
-  /** byteToFloat(b, mantissaBits=5, zeroExponent=2) */
-  public static float byte52ToFloat(byte b) {
-    // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup
-    // is only a little bit faster (anywhere from 0% to 7%)
-    if (b == 0) return 0.0f;
-    int bits = (b&0xff) << (24-5);
-    bits += (63-2) << 24;
-    return Float.intBitsToFloat(bits);
+  /** Float-like encoding for positive longs that preserves ordering and 4 significant bits. */
+  public static int longToInt4(long i) {
+    if (i < 0) {
+      throw new IllegalArgumentException("Only supports positive values, got " + i);
+    }
+    int numBits = 64 - Long.numberOfLeadingZeros(i);
+    if (numBits < 4) {
+      // subnormal value
+      return Math.toIntExact(i);
+    } else {
+      // normal value
+      int shift = numBits - 4;
+      // only keep the 5 most significant bits
+      int encoded = Math.toIntExact(i >>> shift);
+      // clear the most significant bit, which is implicit
+      encoded &= 0x07;
+      // encode the shift, adding 1 because 0 is reserved for subnormal values
+      encoded |= (shift + 1) << 3;
+      return encoded;
+    }
+  }
+
+  /**
+   * Decode values encoded with {@link #longToInt4(long)}.
+   */
+  public static final long int4ToLong(int i) {
+    long bits = i & 0x07;
+    int shift = (i >>> 3) - 1;
+    long decoded;
+    if (shift == -1) {
+      // subnormal value
+      decoded = bits;
+    } else {
+      // normal value
+      decoded = (bits | 0x08) << shift;
+    }
+    return decoded;
+  }
+
+  private static final int MAX_INT4 = longToInt4(Integer.MAX_VALUE);
+  private static final int NUM_FREE_VALUES = 255 - MAX_INT4;
+
+  /**
+   * Encode an integer to a byte. It is built upon {@link #longToInt4(long)}
+   * and leverages the fact that {@code longToInt4(Integer.MAX_VALUE)} is
+   * less than 255 to encode low values more accurately.
+   */
+  public static byte intToByte4(int i) {
+    if (i < 0) {
+      throw new IllegalArgumentException("Only supports positive values, got " + i);
+    }
+    if (i < NUM_FREE_VALUES) {
+      return (byte) i;
+    } else {
+      return (byte) (NUM_FREE_VALUES + longToInt4(i - NUM_FREE_VALUES));
+    }
+  }
+
+  /**
+   * Decode values that have been encoded with {@link #intToByte4(int)}.
+   */
+  public static int byte4ToInt(byte b) {
+    int i = Byte.toUnsignedInt(b);
+    if (i < NUM_FREE_VALUES) {
+      return i;
+    } else {
+      long decoded = NUM_FREE_VALUES + int4ToLong(i - NUM_FREE_VALUES);
+      return Math.toIntExact(decoded);
+    }
   }
 }
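Illustration (not part of the commit; the class name is made up): a round trip through the new encoding. Small lengths (below NUM_FREE_VALUES) survive exactly; larger values keep 4 significant bits, so the decode is a close approximation:

    import org.apache.lucene.util.SmallFloat;

    public class Byte4Demo {
      public static void main(String[] args) {
        for (int length : new int[] {1, 11, 111, 1111, 11111}) {
          byte b = SmallFloat.intToByte4(length);
          System.out.println(length + " -> " + SmallFloat.byte4ToInt(b));
        }
      }
    }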
@@ -2441,7 +2441,7 @@ public class TestIndexSorting extends LuceneTestCase {
         assertTrue(sparseValues.advanceExact(docID));
         assertTrue(sparseBinaryValues.advanceExact(docID));
         assertTrue(normsValues.advanceExact(docID));
-        assertEquals(124, normsValues.longValue());
+        assertEquals(1, normsValues.longValue());
         assertEquals(127-docID, (int) sparseValues.longValue());
         assertEquals(new BytesRef(Integer.toString(127-docID)), sparseBinaryValues.binaryValue());
       } else {
@@ -17,6 +17,7 @@
 package org.apache.lucene.index;
 
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -26,7 +27,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -35,12 +38,12 @@ import org.apache.lucene.util.TestUtil;
 /**
  * Tests the maxTermFrequency statistic in FieldInvertState
  */
-public class TestMaxTermFrequency extends LuceneTestCase { 
+public class TestMaxTermFrequency extends LuceneTestCase {
   Directory dir;
   IndexReader reader;
   /* expected maxTermFrequency values for our documents */
   ArrayList<Integer> expected = new ArrayList<>();
 
   @Override
   public void setUp() throws Exception {
     super.setUp();
@@ -59,14 +62,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
     reader = writer.getReader();
     writer.close();
   }
-  
+
   @Override
   public void tearDown() throws Exception {
     reader.close();
     dir.close();
     super.tearDown();
   }
-  
+
   public void test() throws Exception {
     NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
     for (int i = 0; i < reader.maxDoc(); i++) {
@@ -95,30 +98,42 @@ public class TestMaxTermFrequency extends LuceneTestCase {
     Collections.shuffle(terms, random());
     return Arrays.toString(terms.toArray(new String[terms.size()]));
   }
 
   /**
    * Simple similarity that encodes maxTermFrequency directly as a byte
    */
-  static class TestSimilarity extends TFIDFSimilarity {
+  static class TestSimilarity extends Similarity {
 
     @Override
-    public float lengthNorm(FieldInvertState state) {
+    public long computeNorm(FieldInvertState state) {
       return state.getMaxTermFrequency();
     }
 
     @Override
-    public long encodeNormValue(float f) {
-      return (byte) f;
+    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimWeight() {};
     }
 
     @Override
-    public float decodeNormValue(long norm) {
-      return norm;
+    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+      return new SimScorer() {
+
+        @Override
+        public float score(int doc, float freq) throws IOException {
+          return 0;
+        }
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          return 0;
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          return 0;
+        }
+      };
     }
-
-    @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long docCount) { return 0; }
-    @Override public float sloppyFreq(int distance) { return 0; }
-    @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
   }
 }
@@ -32,13 +32,11 @@ import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
 import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 
 /**
@@ -49,67 +47,6 @@ import org.apache.lucene.util.TestUtil;
 @Slow
 public class TestNorms extends LuceneTestCase {
   static final String BYTE_TEST_FIELD = "normsTestByte";
-
-  static class CustomNormEncodingSimilarity extends TFIDFSimilarity {
-
-    @Override
-    public long encodeNormValue(float f) {
-      return (long) f;
-    }
-
-    @Override
-    public float decodeNormValue(long norm) {
-      return norm;
-    }
-
-    @Override
-    public float lengthNorm(FieldInvertState state) {
-      return state.getLength();
-    }
-
-    @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long docCount) { return 0; }
-    @Override public float sloppyFreq(int distance) { return 0; }
-    @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
-  }
-
-  // LUCENE-1260
-  public void testCustomEncoder() throws Exception {
-    Directory dir = newDirectory();
-    MockAnalyzer analyzer = new MockAnalyzer(random());
-
-    IndexWriterConfig config = newIndexWriterConfig(analyzer);
-    config.setSimilarity(new CustomNormEncodingSimilarity());
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
-    Document doc = new Document();
-    Field foo = newTextField("foo", "", Field.Store.NO);
-    Field bar = newTextField("bar", "", Field.Store.NO);
-    doc.add(foo);
-    doc.add(bar);
-
-    for (int i = 0; i < 100; i++) {
-      bar.setStringValue("singleton");
-      writer.addDocument(doc);
-    }
-
-    IndexReader reader = writer.getReader();
-    writer.close();
-
-    NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      assertEquals(i, fooNorms.nextDoc());
-      assertEquals(0, fooNorms.longValue());
-    }
-
-    NumericDocValues barNorms = MultiDocValues.getNormValues(reader, "bar");
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      assertEquals(i, barNorms.nextDoc());
-      assertEquals(1, barNorms.longValue());
-    }
-
-    reader.close();
-    dir.close();
-  }
-
   public void testMaxByteNorms() throws IOException {
     Directory dir = newFSDirectory(createTempDir("TestNorms.testMaxByteNorms"));
@@ -44,9 +44,7 @@ import org.apache.lucene.util.LuceneTestCase;
 public class TestOmitTf extends LuceneTestCase {
 
   public static class SimpleSimilarity extends TFIDFSimilarity {
-    @Override public float decodeNormValue(long norm) { return norm; }
-    @Override public long encodeNormValue(float f) { return (long) f; }
-    @Override public float lengthNorm(FieldInvertState state) { return 1; }
+    @Override public float lengthNorm(int length) { return 1; }
    @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
     @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@@ -30,7 +30,6 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -72,7 +71,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
   }
 
   @Override
-  public float lengthNorm(FieldInvertState state) {
+  public float lengthNorm(int length) {
     // Disable length norm
     return 1;
   }
@@ -33,6 +33,7 @@ import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.FieldValueHitQueue.Entry;
+import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
@@ -63,7 +64,7 @@ public class TestElevationComparator extends LuceneTestCase {
     writer.close();
 
     IndexSearcher searcher = newSearcher(r);
-    searcher.setSimilarity(new ClassicSimilarity());
+    searcher.setSimilarity(new BM25Similarity());
 
     runTest(searcher, true);
     runTest(searcher, false);
@@ -98,11 +99,11 @@ public class TestElevationComparator extends LuceneTestCase {
     assertEquals(3, topDocs.scoreDocs[1].doc);
 
     if (reversed) {
-      assertEquals(2, topDocs.scoreDocs[2].doc);
-      assertEquals(1, topDocs.scoreDocs[3].doc);
-    } else {
       assertEquals(1, topDocs.scoreDocs[2].doc);
       assertEquals(2, topDocs.scoreDocs[3].doc);
+    } else {
+      assertEquals(2, topDocs.scoreDocs[2].doc);
+      assertEquals(1, topDocs.scoreDocs[3].doc);
     }
 
    /*
@@ -37,6 +37,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
@@ -309,7 +310,7 @@ public class TestPhraseQuery extends LuceneTestCase {
     RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
         newIndexWriterConfig(new MockAnalyzer(random()))
           .setMergePolicy(newLogMergePolicy())
-          .setSimilarity(new ClassicSimilarity()));
+          .setSimilarity(new BM25Similarity()));
 
     Document doc = new Document();
     doc.add(newTextField("field", "foo firstname lastname foo", Field.Store.YES));
@@ -335,9 +336,9 @@ public class TestPhraseQuery extends LuceneTestCase {
     // each other get a higher score:
     assertEquals(1.0, hits[0].score, 0.01);
     assertEquals(0, hits[0].doc);
-    assertEquals(0.62, hits[1].score, 0.01);
+    assertEquals(0.63, hits[1].score, 0.01);
     assertEquals(1, hits[1].doc);
-    assertEquals(0.43, hits[2].score, 0.01);
+    assertEquals(0.47, hits[2].score, 0.01);
     assertEquals(2, hits[2].doc);
     QueryUtils.check(random(), query,searcher);
     reader.close();
@@ -26,6 +26,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
@@ -49,9 +50,14 @@ public class TestQueryRescorer extends LuceneTestCase {
     return searcher;
   }
 
+  public static IndexWriterConfig newIndexWriterConfig() {
+    // We rely on more tokens = lower score:
+    return LuceneTestCase.newIndexWriterConfig().setSimilarity(new ClassicSimilarity());
+  }
+
   public void testBasic() throws Exception {
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     Document doc = new Document();
     doc.add(newStringField("id", "0", Field.Store.YES));
@@ -106,7 +112,7 @@ public class TestQueryRescorer extends LuceneTestCase {
   // Test LUCENE-5682
   public void testNullScorerTermQuery() throws Exception {
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     Document doc = new Document();
     doc.add(newStringField("id", "0", Field.Store.YES));
@@ -145,7 +151,7 @@ public class TestQueryRescorer extends LuceneTestCase {
 
   public void testCustomCombine() throws Exception {
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     Document doc = new Document();
     doc.add(newStringField("id", "0", Field.Store.YES));
@@ -196,7 +202,7 @@ public class TestQueryRescorer extends LuceneTestCase {
 
   public void testExplain() throws Exception {
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     Document doc = new Document();
     doc.add(newStringField("id", "0", Field.Store.YES));
@@ -271,7 +277,7 @@ public class TestQueryRescorer extends LuceneTestCase {
 
   public void testMissingSecondPassScore() throws Exception {
     Directory dir = newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     Document doc = new Document();
     doc.add(newStringField("id", "0", Field.Store.YES));
@@ -325,7 +331,7 @@ public class TestQueryRescorer extends LuceneTestCase {
   public void testRandom() throws Exception {
     Directory dir = newDirectory();
     int numDocs = atLeast(1000);
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
 
     final int[] idToNum = new int[numDocs];
     int maxValue = TestUtil.nextInt(random(), 10, 1000000);
@@ -17,20 +17,18 @@
 package org.apache.lucene.search;
 
 
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.util.LuceneTestCase;
-
 import java.io.IOException;
 
-import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
 
 /** Similarity unit test.
  *
@@ -39,7 +37,7 @@ import org.apache.lucene.document.Document;
 public class TestSimilarity extends LuceneTestCase {
 
   public static class SimpleSimilarity extends ClassicSimilarity {
-    @Override public float lengthNorm(FieldInvertState state) { return 1; }
+    @Override public float lengthNorm(int length) { return 1; }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
     @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@@ -17,19 +17,21 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiDocValues;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
 import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -38,7 +40,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
   private Directory directory;
   private DirectoryReader reader;
   private IndexSearcher searcher;
-  
+
   @Override
   public void setUp() throws Exception {
     super.setUp();
@@ -51,7 +53,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
     doc.add(field);
     Field field2 = newTextField("bar", "", Field.Store.NO);
     doc.add(field2);
-    
+
     field.setStringValue("quick brown fox");
     field2.setStringValue("quick brown fox");
     iw.addDocument(doc);
@@ -63,14 +65,14 @@ public class TestSimilarityProvider extends LuceneTestCase {
     searcher = newSearcher(reader);
     searcher.setSimilarity(sim);
   }
-  
+
   @Override
   public void tearDown() throws Exception {
     reader.close();
     directory.close();
     super.tearDown();
   }
-  
+
   public void testBasics() throws Exception {
     // sanity check of norms writer
     // TODO: generalize
@@ -81,7 +83,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
       assertEquals(i, barNorms.nextDoc());
       assertFalse(fooNorms.longValue() == barNorms.longValue());
     }
-    
+
     // sanity check of searching
     TopDocs foodocs = searcher.search(new TermQuery(new Term("foo", "brown")), 10);
     assertTrue(foodocs.totalHits > 0);
@@ -89,11 +91,11 @@ public class TestSimilarityProvider extends LuceneTestCase {
     assertTrue(bardocs.totalHits > 0);
     assertTrue(foodocs.scoreDocs[0].score < bardocs.scoreDocs[0].score);
   }
-  
+
   private static class ExampleSimilarityProvider extends PerFieldSimilarityWrapper {
     private Similarity sim1 = new Sim1();
     private Similarity sim2 = new Sim2();
-    
+
     @Override
     public Similarity get(String field) {
       if (field.equals("foo")) {
@@ -103,80 +105,73 @@ public class TestSimilarityProvider extends LuceneTestCase {
       }
     }
   }
 
-  private static class Sim1 extends TFIDFSimilarity {
-
-    @Override
-    public long encodeNormValue(float f) {
-      return (long) f;
-    }
-
-    @Override
-    public float decodeNormValue(long norm) {
-      return norm;
-    }
+  private static class Sim1 extends Similarity {
 
     @Override
-    public float lengthNorm(FieldInvertState state) {
-      return 1f;
+    public long computeNorm(FieldInvertState state) {
+      return 1;
     }
 
     @Override
-    public float sloppyFreq(int distance) {
-      return 1f;
+    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimWeight() {};
     }
 
     @Override
-    public float tf(float freq) {
-      return 1f;
-    }
-
-    @Override
-    public float idf(long docFreq, long docCount) {
-      return 1f;
-    }
-
-    @Override
-    public float scorePayload(int doc, int start, int end, BytesRef payload) {
-      return 1f;
+    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+      return new SimScorer() {
+
+        @Override
+        public float score(int doc, float freq) throws IOException {
+          return 1;
+        }
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          return 1;
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          return 1;
+        }
+      };
     }
   }
 
-  private static class Sim2 extends TFIDFSimilarity {
-
-    @Override
-    public long encodeNormValue(float f) {
-      return (long) f;
-    }
-
-    @Override
-    public float decodeNormValue(long norm) {
-      return norm;
-    }
+  private static class Sim2 extends Similarity {
 
     @Override
-    public float lengthNorm(FieldInvertState state) {
-      return 10f;
+    public long computeNorm(FieldInvertState state) {
+      return 10;
    }
 
     @Override
-    public float sloppyFreq(int distance) {
-      return 10f;
+    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimWeight() {};
     }
 
     @Override
-    public float tf(float freq) {
-      return 10f;
-    }
-
-    @Override
-    public float idf(long docFreq, long docCount) {
-      return 10f;
-    }
-
-    @Override
-    public float scorePayload(int doc, int start, int end, BytesRef payload) {
-      return 1f;
+    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+      return new SimScorer() {
+
+        @Override
+        public float score(int doc, float freq) throws IOException {
+          return 10;
+        }
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          return 1;
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          return 1;
+        }
+      };
    }
   }
 }
|
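The rewritten Sim1/Sim2 above show the full surface a Similarity needs once norms are plain longs: computeNorm(FieldInvertState), computeWeight(...) and simScorer(...). A minimal self-contained sketch in the same style (ConstantSimilarity and its fixed score value are illustrative, not part of this patch):

import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;

/** Sketch: a Similarity that ignores norms and statistics and scores every hit the same. */
final class ConstantSimilarity extends Similarity {
  private final float value;

  ConstantSimilarity(float value) {
    this.value = value;
  }

  @Override
  public long computeNorm(FieldInvertState state) {
    return 1; // one norm bucket; field length plays no role here
  }

  @Override
  public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new SimWeight() {}; // no per-query state to carry
  }

  @Override
  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
    return new SimScorer() {
      @Override
      public float score(int doc, float freq) throws IOException {
        return value; // constant score, regardless of freq and norms
      }

      @Override
      public float computeSlopFactor(int distance) {
        return 1;
      }

      @Override
      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
        return 1;
      }
    };
  }
}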
@@ -42,7 +42,7 @@ public class TestSortRescorer extends LuceneTestCase {
  public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setSimilarity(new ClassicSimilarity()));

    Document doc = new Document();
    doc.add(newStringField("id", "1", Field.Store.YES));
@@ -20,19 +20,6 @@ import org.apache.lucene.util.LuceneTestCase;

public class TestAxiomaticSimilarity extends LuceneTestCase {

  public void testSaneNormValues() {
    Axiomatic sim = new AxiomaticF2EXP();
    for (int i = 0; i < 256; i++) {
      float len = sim.decodeNormValue((byte) i);
      assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
      assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
      assertFalse("nan len for byte=" + i, Float.isNaN(len));
      if (i > 0) {
        assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte) (i - 1)));
      }
    }
  }

  public void testIllegalS() {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      new AxiomaticF2EXP(Float.POSITIVE_INFINITY, 0.1f);
@@ -17,23 +17,27 @@
 */
package org.apache.lucene.search.similarities;

import java.io.IOException;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;

public class TestBM25Similarity extends LuceneTestCase {

  public void testSaneNormValues() {
    BM25Similarity sim = new BM25Similarity();
    for (int i = 0; i < 256; i++) {
      float len = sim.decodeNormValue((byte) i);
      assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
      assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
      assertFalse("nan len for byte=" + i, Float.isNaN(len));
      if (i > 0) {
        assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1)));
      }
    }
  }

  public void testIllegalK1() {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      new BM25Similarity(Float.POSITIVE_INFINITY, 0.75f);

@@ -72,4 +76,44 @@ public class TestBM25Similarity extends LuceneTestCase {
    });
    assertTrue(expected.getMessage().contains("illegal b value"));
  }

  public void testLengthEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = new BM25Similarity();
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
      for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately in both cases
        Directory dir = newDirectory();
        // set the version on the directory
        new SegmentInfos(indexCreatedVersionMajor).commit(dir);
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
        Document doc = new Document();
        String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
        doc.add(new TextField("foo", value, Store.NO));
        w.addDocument(doc);
        IndexReader reader = DirectoryReader.open(w);
        IndexSearcher searcher = newSearcher(reader);
        searcher.setSimilarity(similarity);
        Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
        Explanation docLen = findExplanation(expl, "fieldLength");
        assertNotNull(docLen);
        assertEquals(docLen.toString(), length, (int) docLen.getValue());
        w.close();
        reader.close();
        dir.close();
      }
    }
  }

  private static Explanation findExplanation(Explanation expl, String text) {
    if (expl.getDescription().equals(text)) {
      return expl;
    } else {
      for (Explanation sub : expl.getDetails()) {
        Explanation match = findExplanation(sub, text);
        if (match != null) {
          return match;
        }
      }
    }
    return null;
  }
}
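The comment on the length loop above is the crux of the backward-compatibility story: 1, 2 and 4 are lengths that round-trip exactly through both the pre-7.0 byte-encoded norms and the new SmallFloat integer encoding exercised by TestSmallFloat further down. A hedged sanity check using the SmallFloat methods this patch introduces:

import org.apache.lucene.util.SmallFloat;

public class LengthEncodingCheck {
  public static void main(String[] args) {
    for (int length : new int[] {1, 2, 4}) {
      // values below 2^4 keep all their bits under the 4-bit mantissa scheme
      int roundTripped = SmallFloat.byte4ToInt(SmallFloat.intToByte4(length));
      System.out.println(length + " -> " + roundTripped); // prints 1, 2, 4 unchanged
    }
  }
}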
@@ -34,6 +34,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;

public class TestBooleanSimilarity extends LuceneTestCase {

@@ -105,8 +106,8 @@ public class TestBooleanSimilarity extends LuceneTestCase {
    for (int iter = 0; iter < 100; ++iter) {
      final int length = TestUtil.nextInt(random(), 1, 100);
      final int position = random().nextInt(length);
      final int numOverlaps = random().nextInt(50);
      FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100);
      final int numOverlaps = random().nextInt(length);
      FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
      assertEquals(
          sim2.computeNorm(state),
          sim1.computeNorm(state),
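FieldInvertState now carries the major version the index was created with, which is how computeNorm can pick the right encoding per segment. A hedged sketch of computing a norm by hand with the new constructor (argument order taken from the test above; BM25Similarity stands in for any norm-producing Similarity):

// versionMajor, field, position, length, numOverlap, offset
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", 0, 5, 1, 100);
long norm = new BM25Similarity().computeNorm(state);
// with the default discountOverlaps=true this encodes an effective length of 5 - 1 = 4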
@@ -19,24 +19,34 @@ package org.apache.lucene.search.similarities;

import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;

public class TestClassicSimilarity extends LuceneTestCase {
  private Directory directory;

@@ -63,14 +73,6 @@ public class TestClassicSimilarity extends LuceneTestCase {
    IOUtils.close(indexReader, directory);
    super.tearDown();
  }

  // Javadocs give this as an example so we test to make sure it's correct:
  public void testPrecisionLoss() throws Exception {
    ClassicSimilarity sim = new ClassicSimilarity();
    float v = sim.decodeNormValue(sim.encodeNormValue(.89f));
    assertEquals(0.875f, v, 0.0001f);
  }

  public void testHit() throws IOException {
    Query query = new TermQuery(new Term("test", "hit"));

@@ -159,16 +161,83 @@ public class TestClassicSimilarity extends LuceneTestCase {
    assertTrue(topDocs.scoreDocs[0].score != 0);
  }

  public void testSaneNormValues() {
  public void testSaneNormValues() throws IOException {
    ClassicSimilarity sim = new ClassicSimilarity();
    for (int i = 0; i < 256; i++) {
      float boost = sim.decodeNormValue((byte) i);
      float boost = TFIDFSimilarity.OLD_NORM_TABLE[i];
      assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
      assertFalse("inf boost: " + boost + ", byte=" + i, Float.isInfinite(boost));
      assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
      if (i > 0) {
        assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
        assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > TFIDFSimilarity.OLD_NORM_TABLE[i-1]);
      }
    }

    TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, new IndexSearcher(new MultiReader()).collectionStatistics("foo"));
    for (int i = 0; i < 256; i++) {
      float boost = stats.normTable[i];
      assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
      assertFalse("inf boost: " + boost + ", byte=" + i, Float.isInfinite(boost));
      assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
      if (i > 0) {
        assertTrue("boost is not decreasing: " + boost + ",byte=" + i, boost < stats.normTable[i-1]);
      }
    }
  }

  public void testNormEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = new ClassicSimilarity();
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
      for (int length : new int[] {1, 4, 16 }) { // these length values are encoded accurately in both cases
        Directory dir = newDirectory();
        // set the version on the directory
        new SegmentInfos(indexCreatedVersionMajor).commit(dir);
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
        Document doc = new Document();
        String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
        doc.add(new TextField("foo", value, Store.NO));
        w.addDocument(doc);
        IndexReader reader = DirectoryReader.open(w);
        IndexSearcher searcher = newSearcher(reader);
        searcher.setSimilarity(similarity);
        Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
        Explanation fieldNorm = findExplanation(expl, "fieldNorm");
        assertNotNull(fieldNorm);
        assertEquals(fieldNorm.toString(), 1/Math.sqrt(length), fieldNorm.getValue(), 0f);
        w.close();
        reader.close();
        dir.close();
      }
    }
  }

  private static Explanation findExplanation(Explanation expl, String text) {
    if (expl.getDescription().startsWith(text)) {
      return expl;
    } else {
      for (Explanation sub : expl.getDetails()) {
        Explanation match = findExplanation(sub, text);
        if (match != null) {
          return match;
        }
      }
    }
    return null;
  }

  public void testSameNormsAsBM25() {
    ClassicSimilarity sim1 = new ClassicSimilarity();
    BM25Similarity sim2 = new BM25Similarity();
    sim2.setDiscountOverlaps(true);
    for (int iter = 0; iter < 100; ++iter) {
      final int length = TestUtil.nextInt(random(), 1, 1000);
      final int position = random().nextInt(length);
      final int numOverlaps = random().nextInt(length);
      FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
      assertEquals(
          sim2.computeNorm(state),
          sim1.computeNorm(state),
          0f);
    }
  }
}
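The 0f delta in testNormEncodingBackwardCompatibility above is intentional: for lengths 1, 4 and 16 the expected fieldNorm 1/sqrt(length) is a power of two, so both the old byte-quantized encoding and the new one hold it exactly. Worked out:

// 1/sqrt(1)  = 1.0
// 1/sqrt(4)  = 0.5
// 1/sqrt(16) = 0.25
for (int length : new int[] {1, 4, 16}) {
  System.out.println(length + " -> " + 1 / Math.sqrt(length));
}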
@@ -20,16 +20,23 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;

@@ -37,9 +44,13 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.Similarity.SimWeight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

/**
 * Tests the {@link SimilarityBase}-based Similarities. Contains unit tests and

@@ -586,11 +597,11 @@ public class TestSimilarityBase extends LuceneTestCase {

  // LUCENE-5221
  public void testDiscountOverlapsBoost() throws IOException {
    ClassicSimilarity expected = new ClassicSimilarity();
    BM25Similarity expected = new BM25Similarity();
    SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
    expected.setDiscountOverlaps(false);
    actual.setDiscountOverlaps(false);
    FieldInvertState state = new FieldInvertState("foo");
    FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo");
    state.setLength(5);
    state.setNumOverlap(2);
    assertEquals(expected.computeNorm(state), actual.computeNorm(state));

@@ -598,64 +609,32 @@ public class TestSimilarityBase extends LuceneTestCase {
    actual.setDiscountOverlaps(true);
    assertEquals(expected.computeNorm(state), actual.computeNorm(state));
  }

  public void testSaneNormValues() {
    for (SimilarityBase sim : sims) {
      for (int i = 0; i < 256; i++) {
        float len = sim.decodeNormValue((byte) i);
        assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f);
        assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len));
        assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len));
        if (i > 0) {
          assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1)));
        }
      }
    }
  }

  /**
   * make sure the similarity does not go crazy when tested against all possible norm values.
   */
  public void testCrazyIndexTimeBoosts() throws Exception {
    long avgLength = 750;
    long docCount = 500000;
    long numTokens = docCount * avgLength;

    CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);

    long docFreq = 2000;
    long totalTermFreq = 2000 * avgLength;

    TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);

    for (SimilarityBase sim : sims) {
      if (sim instanceof IBSimilarity) {
        if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) {
          // score goes infinite for tiny doc lengths and negative for huge doc lengths
          // TODO: fix this
          continue;
        }
      } else if (sim instanceof DFRSimilarity) {
        BasicModel model = ((DFRSimilarity)sim).getBasicModel();
        if (model instanceof BasicModelD || model instanceof BasicModelP) {
          // score goes NaN for tiny doc lengths
          // TODO: fix this
          continue;
        } else if (model instanceof BasicModelBE) {
          // score goes negative infinity for tiny doc lengths
          // TODO: fix this
          continue;
        }
      }
      BasicStats stats = (BasicStats) sim.computeWeight(1f, collectionStats, termStats);
      for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) {
        for (int i = 0; i < 256; i++) {
          float len = sim.decodeNormValue((byte) i);
          float score = sim.score(stats, tf, len);
          assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
          assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
          assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
        }

  public void testLengthEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = RandomPicks.randomFrom(random(), sims);
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
      for (int length : new int[] {1, 2, 4}) { // these length values are encoded accurately in both cases
        Directory dir = newDirectory();
        // set the version on the directory
        new SegmentInfos(indexCreatedVersionMajor).commit(dir);
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
        Document doc = new Document();
        String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
        doc.add(new TextField("foo", value, Store.NO));
        w.addDocument(doc);
        IndexReader reader = DirectoryReader.open(w);
        IndexSearcher searcher = newSearcher(reader);
        searcher.setSimilarity(similarity);
        Term term = new Term("foo", "b");
        TermContext context = TermContext.build(reader.getContext(), term);
        SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
        SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
        float docLength = simScorer.getLengthValue(0);
        assertEquals(length, (int) docLength);

        w.close();
        reader.close();
        dir.close();
      }
    }
  }
@@ -16,6 +16,8 @@
 */
package org.apache.lucene.util;

import java.util.Arrays;

public class TestSmallFloat extends LuceneTestCase {

  // original lucene byteToFloat

@@ -87,10 +89,6 @@ public class TestSmallFloat extends LuceneTestCase {
      float f3 = SmallFloat.byte315ToFloat((byte)i);
      assertEquals(f1,f2,0.0);
      assertEquals(f2,f3,0.0);

      float f4 = SmallFloat.byteToFloat((byte)i,5,2);
      float f5 = SmallFloat.byte52ToFloat((byte)i);
      assertEquals(f4,f5,0.0);
    }
  }

@@ -121,10 +119,51 @@ public class TestSmallFloat extends LuceneTestCase {
      byte b3 = SmallFloat.floatToByte315(f);
      assertEquals(b1,b2);
      assertEquals(b2,b3);
    }
  }

      byte b4 = SmallFloat.floatToByte(f,5,2);
      byte b5 = SmallFloat.floatToByte52(f);
      assertEquals(b4,b5);
  public void testInt4() {
    for (int i = 0; i <= 16; ++i) {
      // all values in 0-16 are encoded accurately
      assertEquals(i, SmallFloat.int4ToLong(SmallFloat.longToInt4(i)));
    }
    final int maxEncoded = SmallFloat.longToInt4(Long.MAX_VALUE);
    for (int i = 1; i < maxEncoded; ++i) {
      assertTrue(SmallFloat.int4ToLong(i) > SmallFloat.int4ToLong(i - 1));
    }
    final int iters = atLeast(1000);
    for (int iter = 0; iter < iters; ++iter) {
      final long l = TestUtil.nextLong(random(), 0, 1L << TestUtil.nextInt(random(), 5, 61));
      int numBits = 64 - Long.numberOfLeadingZeros(l);
      long expected = l;
      if (numBits > 4) {
        long mask = ~0L << (numBits - 4);
        expected &= mask;
      }
      long l2 = SmallFloat.int4ToLong(SmallFloat.longToInt4(l));
      assertEquals(expected, l2);
    }
  }

  public void testByte4() {
    int[] decoded = new int[256];
    for (int b = 0; b < 256; ++b) {
      decoded[b] = SmallFloat.byte4ToInt((byte) b);
      assertEquals((byte) b, SmallFloat.intToByte4(decoded[b]));
    }
    for (int i = 1; i < 256; ++i) {
      assertTrue(decoded[i] > decoded[i-1]);
    }
    assertEquals((byte) 255, SmallFloat.intToByte4(Integer.MAX_VALUE));
    final int iters = atLeast(1000);
    for (int iter = 0; iter < iters; ++iter) {
      final int i = random().nextInt(1 << TestUtil.nextInt(random(), 5, 30));
      int idx = Arrays.binarySearch(decoded, i);
      if (idx < 0) {
        idx = -2 - idx;
      }
      assertTrue(decoded[idx] <= i);
      assertEquals((byte) idx, SmallFloat.intToByte4(i));
    }
  }

@@ -146,5 +185,4 @@ public class TestSmallFloat extends LuceneTestCase {
    }
  }
  ***/

}
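The expected-value computation inside testInt4 is the contract of the new length encoding in one place: everything past the four most significant bits is dropped. A worked example for l = 1000:

long l = 1000;                                    // binary 1111101000, 10 significant bits
int numBits = 64 - Long.numberOfLeadingZeros(l);  // 10
long mask = ~0L << (numBits - 4);                 // keep only the top 4 bits
long expected = l & mask;                         // binary 1111000000 = 960
// so SmallFloat.int4ToLong(SmallFloat.longToInt4(1000)) == 960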
@@ -44,7 +44,7 @@ public class TestExpressionRescorer extends LuceneTestCase {
  public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setSimilarity(new ClassicSimilarity()));

    Document doc = new Document();
    doc.add(newStringField("id", "1", Field.Store.YES));
@@ -72,6 +72,8 @@ import org.apache.lucene.search.PhraseQuery.Builder;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;

@@ -147,7 +149,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
    CustomScoreQuery query = new CustomScoreQuery(termQuery);

    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
    assertEquals(2, hits.totalHits);
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);

@@ -199,7 +201,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
    query.add(new Term(FIELD_NAME, "very"));

    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
    assertEquals(2, hits.totalHits);
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);

@@ -271,7 +273,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
    };

    searcher = newSearcher(reader);
    TopDocs hits = searcher.search(query, 10);
    TopDocs hits = searcher.search(query, 10, new Sort(SortField.FIELD_DOC, SortField.FIELD_SCORE));
    assertEquals(2, hits.totalHits);
    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);
@@ -892,7 +892,7 @@ public class MemoryIndex {

    NumericDocValues getNormDocValues() {
      if (norm == null) {
        FieldInvertState invertState = new FieldInvertState(fieldInfo.name, fieldInfo.number,
        FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number,
            numTokens, numOverlapTokens, 0);
        final long value = normSimilarity.computeNorm(invertState);
        if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);
@@ -50,6 +50,7 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;

@@ -57,13 +58,16 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

@@ -145,32 +149,32 @@ public class TestMemoryIndex extends LuceneTestCase {

    assertEquals(reader.getTermVectors(0).size(), 1);
  }

  public void testReaderConsistency() throws IOException {
    Analyzer analyzer = new MockPayloadAnalyzer();

    // defaults
    MemoryIndex mi = new MemoryIndex();
    mi.addField("field", "some terms be here", analyzer);
    TestUtil.checkReader(mi.createSearcher().getIndexReader());

    // all combinations of offsets/payloads options
    mi = new MemoryIndex(true, true);
    mi.addField("field", "some terms be here", analyzer);
    TestUtil.checkReader(mi.createSearcher().getIndexReader());

    mi = new MemoryIndex(true, false);
    mi.addField("field", "some terms be here", analyzer);
    TestUtil.checkReader(mi.createSearcher().getIndexReader());

    mi = new MemoryIndex(false, true);
    mi.addField("field", "some terms be here", analyzer);
    TestUtil.checkReader(mi.createSearcher().getIndexReader());

    mi = new MemoryIndex(false, false);
    mi.addField("field", "some terms be here", analyzer);
    TestUtil.checkReader(mi.createSearcher().getIndexReader());

    analyzer.close();
  }

@@ -187,11 +191,23 @@ public class TestMemoryIndex extends LuceneTestCase {
    float n1 = norms.longValue();

    // Norms are re-computed when we change the Similarity
    mi.setSimilarity(new ClassicSimilarity() {
    mi.setSimilarity(new Similarity() {

      @Override
      public float lengthNorm(FieldInvertState state) {
      public long computeNorm(FieldInvertState state) {
        return 74;
      }

      @Override
      public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
        throw new UnsupportedOperationException();
      }

      @Override
      public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
        throw new UnsupportedOperationException();
      }

    });
    norms = reader.getNormValues("f1");
    assertEquals(0, norms.nextDoc());
@@ -17,7 +17,6 @@
package org.apache.lucene.misc;

import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.index.FieldInvertState;

/**
 * <p>

@@ -86,8 +85,7 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
   * Sets the default function variables used by lengthNorm when no field
   * specific variables have been set.
   *
   * @see #computeLengthNorm
   * @see #lengthNorm
   */
  public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) {
    this.ln_min = min;

@@ -94,25 +93,6 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
    this.ln_steep = steepness;
    this.discountOverlaps = discountOverlaps;
  }

  /**
   * Implemented as <code> state.getBoost() *
   * computeLengthNorm(numTokens) </code> where
   * numTokens does not count overlap tokens if
   * discountOverlaps is true by default or true for this
   * specific field.
   */
  @Override
  public float lengthNorm(FieldInvertState state) {
    final int numTokens;

    if (discountOverlaps)
      numTokens = state.getLength() - state.getNumOverlap();
    else
      numTokens = state.getLength();

    return computeLengthNorm(numTokens);
  }

  /**
   * Implemented as:

@@ -133,7 +113,8 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
   * @see #setLengthNormFactors
   * @see <a href="doc-files/ss.computeLengthNorm.svg">An SVG visualization of this function</a>
   */
  public float computeLengthNorm(int numTerms) {
  @Override
  public float lengthNorm(int numTerms) {
    final int l = ln_min;
    final int h = ln_max;
    final float s = ln_steep;
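After this change SweetSpotSimilarity no longer looks at FieldInvertState itself: it overrides the int-based lengthNorm hook, and the token count (overlap-discounted, if configured) is computed before the hook is called. A short usage sketch, with the plateau values the tests below use:

SweetSpotSimilarity ss = new SweetSpotSimilarity();
ss.setLengthNormFactors(3, 10, 0.5f, true); // lengths 3..10 form the sweet spot
float inPlateau = ss.lengthNorm(5);  // 1.0f inside the plateau
float outside = ss.lengthNorm(50);   // < 1.0f, degrading with distance from the plateau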
@@ -16,27 +16,62 @@
 */
package org.apache.lucene.misc;

import java.io.IOException;
import java.util.Collections;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState;

/**
 * Test of the SweetSpotSimilarity
 */
public class SweetSpotSimilarityTest extends LuceneTestCase {

  public static float computeAndDecodeNorm(SweetSpotSimilarity decode, Similarity encode, FieldInvertState state) {
    return decode.decodeNormValue(computeAndGetNorm(encode, state));
  }

  public static byte computeAndGetNorm(Similarity s, FieldInvertState state) {
    return (byte) s.computeNorm(state);
  private static float computeNorm(Similarity sim, String field, int length) throws IOException {
    String value = IntStream.range(0, length).mapToObj(i -> "a").collect(Collectors.joining(" "));
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
    w.addDocument(Collections.singleton(newTextField(field, value, Store.NO)));
    DirectoryReader reader = DirectoryReader.open(w);
    w.close();
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(sim);
    Explanation expl = searcher.explain(new TermQuery(new Term(field, "a")), 0);
    reader.close();
    dir.close();
    Explanation norm = findExplanation(expl, "fieldNorm");
    assertNotNull(norm);
    return norm.getValue();
  }

  public void testSweetSpotComputeNorm() {
  private static Explanation findExplanation(Explanation expl, String text) {
    if (expl.getDescription().startsWith(text)) {
      return expl;
    } else {
      for (Explanation sub : expl.getDetails()) {
        Explanation match = findExplanation(sub, text);
        if (match != null) {
          return match;
        }
      }
    }
    return null;
  }

  public void testSweetSpotComputeNorm() throws IOException {

    final SweetSpotSimilarity ss = new SweetSpotSimilarity();
    ss.setLengthNormFactors(1,1,0.5f,true);

@@ -46,12 +81,10 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {

    // base case, should degrade
    FieldInvertState invertState = new FieldInvertState("bogus");
    for (int i = 1; i < 1000; i++) {
      invertState.setLength(i);
      assertEquals("base case: i="+i,
                   computeAndGetNorm(d, invertState),
                   computeAndGetNorm(s, invertState),
                   computeNorm(d, "bogus", i),
                   computeNorm(s, "bogus", i),
                   0.0f);
    }

@@ -60,22 +93,19 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
    ss.setLengthNormFactors(3,10,0.5f,true);

    for (int i = 3; i <=10; i++) {
      invertState.setLength(i);
      assertEquals("3,10: spot i="+i,
                   1.0f,
                   computeAndDecodeNorm(ss, ss, invertState),
                   computeNorm(ss, "bogus", i),
                   0.0f);
    }

    for (int i = 10; i < 1000; i++) {
      invertState.setLength(i-9);
      final byte normD = computeAndGetNorm(d, invertState);
      invertState.setLength(i);
      final byte normS = computeAndGetNorm(s, invertState);
      final float normD = computeNorm(d, "bogus", i - 9);
      final float normS = computeNorm(s, "bogus", i);
      assertEquals("3,10: 10<x : i="+i,
                   normD,
                   normS,
                   0.0f);
                   0.01f);
    }

@@ -106,78 +136,60 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
      }
    };

    invertState = new FieldInvertState("foo");
    for (int i = 3; i <=10; i++) {
      invertState.setLength(i);
      assertEquals("f: 3,10: spot i="+i,
                   1.0f,
                   computeAndDecodeNorm(ss, sp, invertState),
                   computeNorm(sp, "foo", i),
                   0.0f);
    }

    for (int i = 10; i < 1000; i++) {
      invertState.setLength(i-9);
      final byte normD = computeAndGetNorm(d, invertState);
      invertState.setLength(i);
      final byte normS = computeAndGetNorm(sp, invertState);
      final float normD = computeNorm(d, "foo", i-9);
      final float normS = computeNorm(sp, "foo", i);
      assertEquals("f: 3,10: 10<x : i="+i,
                   normD,
                   normS,
                   0.0f);
                   0.01f);
    }

    invertState = new FieldInvertState("bar");
    for (int i = 8; i <=13; i++) {
      invertState.setLength(i);
      assertEquals("f: 8,13: spot i="+i,
                   1.0f,
                   computeAndDecodeNorm(ss, sp, invertState),
                   0.0f);
                   computeNorm(sp, "bar", i),
                   0.01f);
    }

    invertState = new FieldInvertState("yak");
    for (int i = 6; i <=9; i++) {
      invertState.setLength(i);
      assertEquals("f: 6,9: spot i="+i,
                   1.0f,
                   computeAndDecodeNorm(ss, sp, invertState),
                   0.0f);
                   computeNorm(sp, "yak", i),
                   0.01f);
    }

    invertState = new FieldInvertState("bar");
    for (int i = 13; i < 1000; i++) {
      invertState.setLength(i-12);
      final byte normD = computeAndGetNorm(d, invertState);
      invertState.setLength(i);
      final byte normS = computeAndGetNorm(sp, invertState);
      final float normD = computeNorm(d, "bar", i-12);
      final float normS = computeNorm(sp, "bar", i);
      assertEquals("f: 8,13: 13<x : i="+i,
                   normD,
                   normS,
                   0.0f);
                   0.01f);
    }

    invertState = new FieldInvertState("yak");
    for (int i = 9; i < 1000; i++) {
      invertState.setLength(i-8);
      final byte normD = computeAndGetNorm(d, invertState);
      invertState.setLength(i);
      final byte normS = computeAndGetNorm(sp, invertState);
      final float normD = computeNorm(d, "yak", i-8);
      final float normS = computeNorm(sp, "yak", i);
      assertEquals("f: 6,9: 9<x : i="+i,
                   normD,
                   normS,
                   0.0f);
                   0.01f);
    }

    // steepness

    for (int i = 9; i < 1000; i++) {
      invertState = new FieldInvertState("a");
      invertState.setLength(i);
      final byte normSS = computeAndGetNorm(sp, invertState);
      invertState = new FieldInvertState("b");
      invertState.setLength(i);
      final byte normS = computeAndGetNorm(sp, invertState);
      final float normSS = computeNorm(sp, "a", i);
      final float normS = computeNorm(sp, "b", i);
      assertTrue("s: i="+i+" : a="+normSS+
                 " < b="+normS,
                 normSS < normS);
@@ -20,19 +20,24 @@ import java.io.IOException;
import java.util.Map;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.docvalues.FloatDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity.SimWeight;

/**
 * Function that returns {@link TFIDFSimilarity#decodeNormValue(long)}
 * for every document.
 * Function that returns the decoded norm for every document.
 * <p>
 * Note that the configured Similarity for the field must be
 * a subclass of {@link TFIDFSimilarity}
 * a subclass of {@link TFIDFSimilarity} and the contribution of
 * the TF needs to be 1 when the freq is 1 and the contribution
 * of the IDF needs to be 1 when docFreq == docCount == 1.
 * @lucene.internal */
public class NormValueSource extends ValueSource {
  protected final String field;

@@ -61,11 +66,12 @@ public class NormValueSource extends ValueSource {
    if (similarity == null) {
      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
    }
    final NumericDocValues norms = readerContext.reader().getNormValues(field);

    if (norms == null) {
      return new ConstDoubleDocValues(0.0, this);
    }
    // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf
    // is 1 when docCount == docFreq == 1
    final SimWeight simWeight = similarity.computeWeight(1f,
        new CollectionStatistics(field, 1, 1, 1, 1),
        new TermStatistics(new BytesRef("bogus"), 1, 1));
    final SimScorer simScorer = similarity.simScorer(simWeight, readerContext);

    return new FloatDocValues(this) {
      int lastDocID = -1;

@@ -74,16 +80,8 @@ public class NormValueSource extends ValueSource {
        if (docID < lastDocID) {
          throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID);
        }
        if (docID > norms.docID()) {
          norms.advance(docID);
        }
        long norm;
        if (docID == norms.docID()) {
          norm = norms.longValue();
        } else {
          norm = 0;
        }
        return similarity.decodeNormValue(norm);
        lastDocID = docID;
        return simScorer.score(docID, 1f);
      }
    };
  }
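The rewritten getValues recovers the decoded norm without calling TFIDFSimilarity.decodeNormValue: against CollectionStatistics(field, 1, 1, 1, 1) and a docFreq == docCount == 1 term, a similarity meeting the documented conditions contributes 1 for both tf and idf, so score(docID, 1f) is the norm itself. Worked out for ClassicSimilarity (hedged: assumes the 7.x idf of 1 + log((docCount+1)/(docFreq+1))):

// tf(1)    = sqrt(1)                    = 1
// idf(1,1) = 1 + log((1 + 1) / (1 + 1)) = 1
// score    = tf * idf * decodedNorm     = decodedNorm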
@@ -21,7 +21,6 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;

@@ -33,10 +32,9 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;

@@ -48,7 +46,7 @@ public class TestLongNormValueSource extends LuceneTestCase {
  static IndexSearcher searcher;
  static Analyzer analyzer;

  private static Similarity sim = new PreciseClassicSimilarity();
  private static Similarity sim = new ClassicSimilarity();

  @BeforeClass
  public static void beforeClass() throws Exception {

@@ -116,114 +114,3 @@ public class TestLongNormValueSource extends LuceneTestCase {
    CheckHits.checkExplanations(q, "", searcher);
  }
}


/** Encodes norm as 4-byte float. */
class PreciseClassicSimilarity extends TFIDFSimilarity {

  /** Sole constructor: parameter-free */
  public PreciseClassicSimilarity() {}

  /**
   * Encodes a normalization factor for storage in an index.
   * <p>
   * The encoding uses a three-bit mantissa, a five-bit exponent, and the
   * zero-exponent point at 15, thus representing values from around 7x10^9 to
   * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
   * represented. Negative numbers are rounded up to zero. Values too large to
   * represent are rounded down to the largest representable value. Positive
   * values too small to represent are rounded up to the smallest positive
   * representable value.
   *
   * @see org.apache.lucene.util.SmallFloat
   */
  @Override
  public final long encodeNormValue(float f) {
    return Float.floatToIntBits(f);
  }

  /**
   * Decodes the norm value, assuming it is a single byte.
   *
   * @see #encodeNormValue(float)
   */
  @Override
  public final float decodeNormValue(long norm) {
    return Float.intBitsToFloat((int)norm);
  }

  /** Implemented as
   * <code>state.getBoost()*lengthNorm(numTerms)</code>, where
   * <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
   * #setDiscountOverlaps} is false, else it's {@link
   * org.apache.lucene.index.FieldInvertState#getLength()} - {@link
   * org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
   *
   * @lucene.experimental */
  @Override
  public float lengthNorm(FieldInvertState state) {
    final int numTerms;
    if (discountOverlaps) {
      numTerms = state.getLength() - state.getNumOverlap();
    } else {
      numTerms = state.getLength();
    }
    return (float) (1.0 / Math.sqrt(numTerms));
  }

  /** Implemented as <code>sqrt(freq)</code>. */
  @Override
  public float tf(float freq) {
    return (float)Math.sqrt(freq);
  }

  /** Implemented as <code>1 / (distance + 1)</code>. */
  @Override
  public float sloppyFreq(int distance) {
    return 1.0f / (distance + 1);
  }

  /** The default implementation returns <code>1</code> */
  @Override
  public float scorePayload(int doc, int start, int end, BytesRef payload) {
    return 1;
  }

  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
  @Override
  public float idf(long docFreq, long docCount) {
    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
  }

  /**
   * True if overlap tokens (tokens with a position increment of zero) are
   * discounted from the document's length.
   */
  protected boolean discountOverlaps = true;

  /** Determines whether overlap tokens (Tokens with
   * 0 position increment) are ignored when computing
   * norm. By default this is true, meaning overlap
   * tokens do not count when computing norms.
   *
   * @lucene.experimental
   *
   * @see #computeNorm
   */
  public void setDiscountOverlaps(boolean v) {
    discountOverlaps = v;
  }

  /**
   * Returns true if overlap tokens are discounted from the document's length.
   * @see #setDiscountOverlaps
   */
  public boolean getDiscountOverlaps() {
    return discountOverlaps;
  }

  @Override
  public String toString() {
    return "DefaultSimilarity";
  }
}
@@ -367,7 +367,7 @@ public class TestValueSources extends LuceneTestCase {
    // no norm field (so agnostic to indexed similarity)
    searcher.setSimilarity(new ClassicSimilarity());
    ValueSource vs = new NormValueSource("byte");
    assertHits(new FunctionQuery(vs), new float[] { 0f, 0f });
    assertHits(new FunctionQuery(vs), new float[] { 1f, 1f });

    // regardless of whether norms exist, value source exists == 0
    assertAllExist(vs);
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;

@@ -143,9 +142,9 @@ public class TestPayloadScoreQuery extends LuceneTestCase {
    // check includeSpanScore makes a difference here
    searcher.setSimilarity(new MultiplyingSimilarity());
    try {
      checkQuery(q, new MaxPayloadFunction(), new int[]{ 122, 222 }, new float[]{ 41.802513122558594f, 34.13160705566406f });
      checkQuery(q, new MinPayloadFunction(), new int[]{ 222, 122 }, new float[]{ 34.13160705566406f, 20.901256561279297f });
      checkQuery(q, new AveragePayloadFunction(), new int[] { 122, 222 }, new float[]{ 38.3189697265625f, 34.13160705566406f });
      checkQuery(q, new MaxPayloadFunction(), new int[]{ 122, 222 }, new float[]{ 20.901256561279297f, 17.06580352783203f });
      checkQuery(q, new MinPayloadFunction(), new int[]{ 222, 122 }, new float[]{ 17.06580352783203f, 10.450628280639648f });
      checkQuery(q, new AveragePayloadFunction(), new int[] { 122, 222 }, new float[]{ 19.15948486328125f, 17.06580352783203f });
      checkQuery(q, new MaxPayloadFunction(), false, new int[]{122, 222}, new float[]{4.0f, 4.0f});
      checkQuery(q, new MinPayloadFunction(), false, new int[]{222, 122}, new float[]{4.0f, 2.0f});
      checkQuery(q, new AveragePayloadFunction(), false, new int[]{222, 122}, new float[]{4.0f, 3.666666f});

@@ -298,7 +297,7 @@ public class TestPayloadScoreQuery extends LuceneTestCase {
    //Make everything else 1 so we see the effect of the payload
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    @Override
    public float lengthNorm(FieldInvertState state) {
    public float lengthNorm(int length) {
      return 1;
    }
@@ -25,7 +25,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;

@@ -268,7 +267,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
    //Make everything else 1 so we see the effect of the payload
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    @Override
    public float lengthNorm(FieldInvertState state) {
    public float lengthNorm(int length) {
      return 1;
    }
@@ -31,7 +31,7 @@ import java.util.Random;
 * for the same field.
 */
public class RandomSimilarity extends PerFieldSimilarityWrapper {
  final ClassicSimilarity defaultSim = new ClassicSimilarity();
  final BM25Similarity defaultSim = new BM25Similarity();
  final List<Similarity> knownSims;
  Map<String,Similarity> previousMappings = new HashMap<>();
  final int perFieldSeed;
@@ -86,8 +86,8 @@ public class DisMaxRequestHandlerTest extends SolrTestCaseJ4 {
        req("cool stuff")
        ,"//*[@numFound='3']"
        ,"//result/doc[1]/int[@name='id'][.='42']"
        ,"//result/doc[2]/int[@name='id'][.='8675309']"
        ,"//result/doc[3]/int[@name='id'][.='666']"
        ,"//result/doc[2]/int[@name='id'][.='666']"
        ,"//result/doc[3]/int[@name='id'][.='8675309']"
        );

    assertQ("multi qf",
@@ -97,8 +97,8 @@ public class QueryElevationComponentTest extends SolrTestCaseJ4 {
            CommonParams.FL, "id, score, [elevated]")
        , "//*[@numFound='3']"
        , "//result/doc[1]/float[@name='id'][.='7.0']"
        , "//result/doc[2]/float[@name='id'][.='8.0']"
        , "//result/doc[3]/float[@name='id'][.='9.0']",
        , "//result/doc[2]/float[@name='id'][.='9.0']"
        , "//result/doc[3]/float[@name='id'][.='8.0']",
        "//result/doc[1]/bool[@name='[elevated]'][.='true']",
        "//result/doc[2]/bool[@name='[elevated]'][.='false']",
        "//result/doc[3]/bool[@name='[elevated]'][.='false']"
@@ -49,6 +49,6 @@ public class TestPayloadScoreQParserPlugin extends SolrTestCaseJ4 {

    // TODO: fix this includeSpanScore test to be less brittle - score result is score of "A" (via BM25) multiplied by 1.0 (payload value)
    assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min}"), "//float[@name='score']='1.0'");
    assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.25811607'");
    assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.2876821'");
  }
}
@@ -65,9 +65,9 @@ public class SortByFunctionTest extends AbstractSolrTestCase {
    assertQ(req("fl", "id,score", "q", "f_t:ipod", "sort", "score desc"),
        "//*[@numFound='4']",
        "//result/doc[1]/int[@name='id'][.='1']",
        "//result/doc[2]/int[@name='id'][.='4']",
        "//result/doc[3]/int[@name='id'][.='2']",
        "//result/doc[4]/int[@name='id'][.='3']"
        "//result/doc[2]/int[@name='id'][.='2']",
        "//result/doc[3]/int[@name='id'][.='3']",
        "//result/doc[4]/int[@name='id'][.='4']"
    );

@@ -25,7 +25,6 @@ import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.solr.SolrTestCaseJ4;

@@ -431,12 +430,8 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
    assertQ(req("fl","*,score","q", "{!func}tf(a_tfidf,cow)", "fq","id:6"),
        "//float[@name='score']='" + similarity.tf(5) + "'");

    FieldInvertState state = new FieldInvertState("a_tfidf");
    state.setLength(4);
    long norm = similarity.computeNorm(state);
    float nrm = similarity.decodeNormValue((byte) norm);
    assertQ(req("fl","*,score","q", "{!func}norm(a_tfidf)", "fq","id:2"),
        "//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
        "//float[@name='score']='0.5'"); // 1/sqrt(4)==1/2==0.5

  }
@@ -16,8 +16,22 @@
  */
 package org.apache.solr.search.similarities;
 
+import java.io.IOException;
+import java.util.Collections;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.misc.SweetSpotSimilarity;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.Directory;
 import org.junit.BeforeClass;
 
 /**
@@ -28,7 +42,38 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
   public static void beforeClass() throws Exception {
     initCore("solrconfig-basic.xml","schema-sweetspot.xml");
   }
+
+  private static float computeNorm(Similarity sim, int length) throws IOException {
+    String value = IntStream.range(0, length).mapToObj(i -> "a").collect(Collectors.joining(" "));
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(sim));
+    w.addDocument(Collections.singleton(newTextField("foo", value, Store.NO)));
+    DirectoryReader reader = DirectoryReader.open(w);
+    w.close();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    searcher.setSimilarity(sim);
+    Explanation expl = searcher.explain(new TermQuery(new Term("foo", "a")), 0);
+    reader.close();
+    dir.close();
+    Explanation norm = findExplanation(expl, "fieldNorm");
+    assertNotNull(norm);
+    return norm.getValue();
+  }
+
+  private static Explanation findExplanation(Explanation expl, String text) {
+    if (expl.getDescription().startsWith(text)) {
+      return expl;
+    } else {
+      for (Explanation sub : expl.getDetails()) {
+        Explanation match = findExplanation(sub, text);
+        if (match != null) {
+          return match;
+        }
+      }
+    }
+    return null;
+  }
 
   /** default parameters */
   public void testDefaults() throws Exception {
     SweetSpotSimilarity sim = getSimilarity("text", SweetSpotSimilarity.class);
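Note: the helper added above measures the norm a query actually observes, rather than calling the similarity directly: it indexes one synthetic document whose "foo" field holds `length` copies of the term "a", runs explain, and walks the Explanation tree for the "fieldNorm" factor. A hedged usage sketch, with expected values taken from the default-parameter assertions later in this file:

    // Usage sketch (inside this test class, reusing the helper above):
    Similarity sim = new SweetSpotSimilarity(); // assumes library defaults
    assertEquals(1.00f, computeNorm(sim, 1), 0.0f); // full norm for a 1-term field
    assertEquals(0.50f, computeNorm(sim, 4), 0.0f); // 1/sqrt(4)
    assertEquals(0.25f, computeNorm(sim, 16), 0.0f); // 1/sqrt(16)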
@@ -40,9 +85,9 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
     }
 
     // default norm sanity check
-    assertEquals("norm 1", 1.00F, sim.computeLengthNorm(1), 0.0F);
-    assertEquals("norm 4", 0.50F, sim.computeLengthNorm(4), 0.0F);
-    assertEquals("norm 16", 0.25F, sim.computeLengthNorm(16), 0.0F);
+    assertEquals("norm 1", 1.00F, computeNorm(sim, 1), 0.0F);
+    assertEquals("norm 4", 0.50F, computeNorm(sim, 4), 0.0F);
+    assertEquals("norm 16", 0.25F, computeNorm(sim, 16), 0.0F);
   }
 
   /** baseline with parameters */
@@ -65,17 +110,17 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
 
     // norms: plateau from 3-5
     assertEquals("norm 1 == 7",
-                 sim.computeLengthNorm(1), sim.computeLengthNorm(7), 0.0F);
+                 computeNorm(sim, 1), computeNorm(sim, 7), 0.0F);
     assertEquals("norm 2 == 6",
-                 sim.computeLengthNorm(1), sim.computeLengthNorm(7), 0.0F);
-    assertEquals("norm 3", 1.00F, sim.computeLengthNorm(3), 0.0F);
-    assertEquals("norm 4", 1.00F, sim.computeLengthNorm(4), 0.0F);
-    assertEquals("norm 5", 1.00F, sim.computeLengthNorm(5), 0.0F);
-    assertTrue("norm 6 too high: " + sim.computeLengthNorm(6),
-               sim.computeLengthNorm(6) < 1.0F);
+                 computeNorm(sim, 2), computeNorm(sim, 6), 0.0F);
+    assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
+    assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
+    assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
+    assertTrue("norm 6 too high: " + computeNorm(sim, 6),
+               computeNorm(sim, 6) < 1.0F);
     assertTrue("norm 7 higher than norm 6",
-               sim.computeLengthNorm(7) < sim.computeLengthNorm(6));
-    assertEquals("norm 20", 0.25F, sim.computeLengthNorm(20), 0.0F);
+               computeNorm(sim, 7) < computeNorm(sim, 6));
+    assertEquals("norm 20", 0.25F, computeNorm(sim, 20), 0.0F);
   }
 
   /** hyperbolic with parameters */
@@ -92,16 +137,16 @@ public class TestSweetSpotSimilarityFactory extends BaseSimilarityTestCase {
     assertEquals("MID tf", 3.3F+(7.7F - 3.3F)/2.0F, sim.tf(5), 0.00001F);
 
     // norms: plateau from 1-5, shallow slope
-    assertEquals("norm 1", 1.00F, sim.computeLengthNorm(1), 0.0F);
-    assertEquals("norm 2", 1.00F, sim.computeLengthNorm(2), 0.0F);
-    assertEquals("norm 3", 1.00F, sim.computeLengthNorm(3), 0.0F);
-    assertEquals("norm 4", 1.00F, sim.computeLengthNorm(4), 0.0F);
-    assertEquals("norm 5", 1.00F, sim.computeLengthNorm(5), 0.0F);
-    assertTrue("norm 6 too high: " + sim.computeLengthNorm(6),
-               sim.computeLengthNorm(6) < 1.0F);
+    assertEquals("norm 1", 1.00F, computeNorm(sim, 1), 0.0F);
+    assertEquals("norm 2", 1.00F, computeNorm(sim, 2), 0.0F);
+    assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
+    assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
+    assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
+    assertTrue("norm 6 too high: " + computeNorm(sim, 6),
+               computeNorm(sim, 6) < 1.0F);
     assertTrue("norm 7 higher than norm 6",
-               sim.computeLengthNorm(7) < sim.computeLengthNorm(6));
-    assertTrue("norm 20 not high enough: " + sim.computeLengthNorm(20),
-               0.25F < sim.computeLengthNorm(20));
+               computeNorm(sim, 7) < computeNorm(sim, 6));
+    assertTrue("norm 20 not high enough: " + computeNorm(sim, 20),
+               0.25F < computeNorm(sim, 20));
   }
 }
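Note: for reference, the plateau these assertions describe comes from SweetSpotSimilarity's configurable length norm, which is flat at 1.0 between a minimum and maximum field length and decays outside that band. A hedged sketch of the equivalent programmatic setup (the tests actually configure this via schema-sweetspot.xml, which is not part of this diff; the setter signature is from Lucene's misc module and should be treated as illustrative):

    // Configuration sketch: a "plateau from 3-5" length norm set up in code.
    SweetSpotSimilarity sim = new SweetSpotSimilarity();
    // Fields of 3 to 5 terms get a norm of 1.0; steepness 0.5 controls how
    // quickly the norm decays outside that band.
    sim.setLengthNormFactors(3, 5, 0.5f, true); // last arg: discountOverlaps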