mirror of https://github.com/apache/lucene.git
LUCENE-5078: Allow TfIDFSimilarity implementations to encode norm values into more than a single byte
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1496579 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
450534ff3a
commit
3cf2f34181
|
@ -109,6 +109,11 @@ Changes in backwards compatibility policy
|
|||
Lucene has much better support for numeric data through (Int|Long)Field,
|
||||
NumericRangeQuery and FieldCache.get(Int|Long)s. (Adrien Grand)
|
||||
|
||||
* LUCENE-5078: TfIDFSimilarity lets you encode the norm value as any arbitrary long.
|
||||
As a result, encode/decodeNormValue were made abstract with their signatures changed.
|
||||
The default implementation was moved to DefaultSimilarity, which encodes the norm as
|
||||
a single-byte value. (Shai Erera)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-4890: QueryTreeBuilder.getBuilder() only finds interfaces on the
|
||||
|
|
|
@ -19,10 +19,40 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
/** Expert: Default scoring implementation. */
|
||||
/**
|
||||
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
|
||||
* encodes} norm values as a single byte before being stored. At search time,
|
||||
* the norm byte value is read from the index
|
||||
* {@link org.apache.lucene.store.Directory directory} and
|
||||
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
|
||||
* This encoding/decoding, while reducing index size, comes with the price of
|
||||
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
|
||||
* instance, <i>decode(encode(0.89)) = 0.75</i>.
|
||||
* <p>
|
||||
* Compression of norm values to a single byte saves memory at search time,
|
||||
* because once a field is referenced at search time, its norms - for all
|
||||
* documents - are maintained in memory.
|
||||
* <p>
|
||||
* The rationale supporting such lossy compression of norm values is that given
|
||||
* the difficulty (and inaccuracy) of users to express their true information
|
||||
* need by a query, only big differences matter. <br>
|
||||
* <br>
|
||||
* Last, note that search time is too late to modify this <i>norm</i> part of
|
||||
* scoring, e.g. by using a different {@link Similarity} for search.
|
||||
*/
|
||||
public class DefaultSimilarity extends TFIDFSimilarity {
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public DefaultSimilarity() {}
|
||||
|
||||
|
@ -38,6 +68,35 @@ public class DefaultSimilarity extends TFIDFSimilarity {
|
|||
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a normalization factor for storage in an index.
|
||||
* <p>
|
||||
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
|
||||
* zero-exponent point at 15, thus representing values from around 7x10^9 to
|
||||
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
|
||||
* represented. Negative numbers are rounded up to zero. Values too large to
|
||||
* represent are rounded down to the largest representable value. Positive
|
||||
* values too small to represent are rounded up to the smallest positive
|
||||
* representable value.
|
||||
*
|
||||
* @see org.apache.lucene.document.Field#setBoost(float)
|
||||
* @see org.apache.lucene.util.SmallFloat
|
||||
*/
|
||||
@Override
|
||||
public final long encodeNormValue(float f) {
|
||||
return SmallFloat.floatToByte315(f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the norm value, assuming it is a single byte.
|
||||
*
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
@Override
|
||||
public final float decodeNormValue(long norm) {
|
||||
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
|
||||
}
|
||||
|
||||
/** Implemented as
|
||||
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
|
||||
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
|
||||
|
|
|
@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -496,27 +495,8 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* <td></td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* <br> <br>
|
||||
* However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
|
||||
* before being stored.
|
||||
* At search time, the norm byte value is read from the index
|
||||
* {@link org.apache.lucene.store.Directory directory} and
|
||||
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
|
||||
* This encoding/decoding, while reducing index size, comes with the price of
|
||||
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
|
||||
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
|
||||
* <br> <br>
|
||||
* Compression of norm values to a single byte saves memory at search time,
|
||||
* because once a field is referenced at search time, its norms - for
|
||||
* all documents - are maintained in memory.
|
||||
* <br> <br>
|
||||
* The rationale supporting such lossy compression of norm values is that
|
||||
* given the difficulty (and inaccuracy) of users to express their true information
|
||||
* need by a query, only big differences matter.
|
||||
* <br> <br>
|
||||
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
|
||||
* using a different {@link Similarity} for search.
|
||||
* <br> <br>
|
||||
* Note that search time is too late to modify this <i>norm</i> part of scoring,
|
||||
* e.g. by using a different {@link Similarity} for search.
|
||||
* </li>
|
||||
* </ol>
|
||||
*
|
||||
|
@ -666,38 +646,15 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
return encodeNormValue(normValue);
|
||||
}
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Decodes a normalization factor stored in an index.
|
||||
/**
|
||||
* Decodes a normalization factor stored in an index.
|
||||
*
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
public float decodeNormValue(byte b) {
|
||||
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
|
||||
}
|
||||
public abstract float decodeNormValue(long norm);
|
||||
|
||||
/** Encodes a normalization factor for storage in an index.
|
||||
*
|
||||
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
|
||||
* the zero-exponent point at 15, thus
|
||||
* representing values from around 7x10^9 to 2x10^-9 with about one
|
||||
* significant decimal digit of accuracy. Zero is also represented.
|
||||
* Negative numbers are rounded up to zero. Values too large to represent
|
||||
* are rounded down to the largest representable value. Positive values too
|
||||
* small to represent are rounded up to the smallest positive representable
|
||||
* value.
|
||||
* @see org.apache.lucene.document.Field#setBoost(float)
|
||||
* @see org.apache.lucene.util.SmallFloat
|
||||
*/
|
||||
public byte encodeNormValue(float f) {
|
||||
return SmallFloat.floatToByte315(f);
|
||||
}
|
||||
/** Encodes a normalization factor for storage in an index. */
|
||||
public abstract long encodeNormValue(float f);
|
||||
|
||||
/** Computes the amount of a sloppy phrase match, based on an edit distance.
|
||||
* This value is summed for each sloppy phrase match in a document to form
|
||||
|
@ -756,7 +713,7 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
public float score(int doc, float freq) {
|
||||
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
|
||||
|
||||
return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
|
||||
return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -843,8 +800,7 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
fieldExpl.addDetail(stats.idf);
|
||||
|
||||
Explanation fieldNormExpl = new Explanation();
|
||||
float fieldNorm =
|
||||
norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f;
|
||||
float fieldNorm = norms != null ? decodeNormValue(norms.get(doc)) : 1.0f;
|
||||
fieldNormExpl.setValue(fieldNorm);
|
||||
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
|
||||
fieldExpl.addDetail(fieldNormExpl);
|
||||
|
|
|
@ -26,8 +26,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
|
@ -97,21 +98,28 @@ public class TestMaxTermFrequency extends LuceneTestCase {
|
|||
/**
|
||||
* Simple similarity that encodes maxTermFrequency directly as a byte
|
||||
*/
|
||||
class TestSimilarity extends DefaultSimilarity {
|
||||
|
||||
@Override
|
||||
public byte encodeNormValue(float f) {
|
||||
return (byte) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(byte b) {
|
||||
return (float) b;
|
||||
}
|
||||
class TestSimilarity extends TFIDFSimilarity {
|
||||
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
return state.getMaxTermFrequency();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long encodeNormValue(float f) {
|
||||
return (long) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
}
|
||||
|
||||
@Override public float coord(int overlap, int maxOverlap) { return 0; }
|
||||
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,7 +29,9 @@ import org.apache.lucene.search.TermStatistics;
|
|||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LineFileDocs;
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
|
@ -45,21 +47,29 @@ import org.apache.lucene.util._TestUtil;
|
|||
public class TestNorms extends LuceneTestCase {
|
||||
final String byteTestField = "normsTestByte";
|
||||
|
||||
class CustomNormEncodingSimilarity extends DefaultSimilarity {
|
||||
class CustomNormEncodingSimilarity extends TFIDFSimilarity {
|
||||
|
||||
@Override
|
||||
public byte encodeNormValue(float f) {
|
||||
return (byte) f;
|
||||
public long encodeNormValue(float f) {
|
||||
return (long) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(byte b) {
|
||||
return (float) b;
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
return state.getLength();
|
||||
}
|
||||
|
||||
@Override public float coord(int overlap, int maxOverlap) { return 0; }
|
||||
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
|
||||
// LUCENE-1260
|
||||
|
|
|
@ -18,26 +18,35 @@ package org.apache.lucene.index;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
||||
public class TestOmitTf extends LuceneTestCase {
|
||||
|
||||
public static class SimpleSimilarity extends TFIDFSimilarity {
|
||||
@Override public float decodeNormValue(long norm) { return norm; }
|
||||
@Override public long encodeNormValue(float f) { return (long) f; }
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
|
||||
@Override
|
||||
|
|
|
@ -107,6 +107,16 @@ public class TestSimilarityProvider extends LuceneTestCase {
|
|||
|
||||
private class Sim1 extends TFIDFSimilarity {
|
||||
|
||||
@Override
|
||||
public long encodeNormValue(float f) {
|
||||
return (long) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return 1f;
|
||||
|
@ -145,6 +155,16 @@ public class TestSimilarityProvider extends LuceneTestCase {
|
|||
|
||||
private class Sim2 extends TFIDFSimilarity {
|
||||
|
||||
@Override
|
||||
public long encodeNormValue(float f) {
|
||||
return (long) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return 1f;
|
||||
|
|
|
@ -29,7 +29,7 @@ import java.io.IOException;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)}
|
||||
* Function that returns {@link TFIDFSimilarity#decodeNormValue(long)}
|
||||
* for every document.
|
||||
* <p>
|
||||
* Note that the configured Similarity for the field must be
|
||||
|
|
|
@ -357,9 +357,7 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
|
|||
*/
|
||||
private static byte expectedNorm(final DefaultSimilarity sim,
|
||||
final int length, final float boost) {
|
||||
|
||||
return sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
|
||||
|
||||
return (byte) sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue