LUCENE-5078: Allow TFIDFSimilarity implementations to encode norm values into more than a single byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1496579 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2013-06-25 18:39:28 +00:00
parent 450534ff3a
commit 3cf2f34181
9 changed files with 146 additions and 81 deletions

View File: lucene/CHANGES.txt

@@ -109,6 +109,11 @@ Changes in backwards compatibility policy
Lucene has much better support for numeric data through (Int|Long)Field,
NumericRangeQuery and FieldCache.get(Int|Long)s. (Adrien Grand)
* LUCENE-5078: TFIDFSimilarity lets you encode the norm value as an arbitrary long.
As a result, encode/decodeNormValue were made abstract and their signatures changed.
The default implementation was moved to DefaultSimilarity, which encodes the norm as
a single-byte value. (Shai Erera)
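To illustrate what the widened, now-abstract signatures allow, here is a minimal sketch (not part of this commit; the class name and formulas are hypothetical, modeled on the test similarities updated further down in this patch) of a TFIDFSimilarity that keeps norms lossless by storing the raw float bits in the long:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.similarities.TFIDFSimilarity;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical example, not part of LUCENE-5078 itself.
    public class LosslessNormSimilarity extends TFIDFSimilarity {

      @Override
      public long encodeNormValue(float f) {
        // Keep all 32 bits of the float in the long norm, so decode(encode(x)) == x.
        return Float.floatToIntBits(f) & 0xFFFFFFFFL;
      }

      @Override
      public float decodeNormValue(long norm) {
        return Float.intBitsToFloat((int) norm);
      }

      @Override
      public float lengthNorm(FieldInvertState state) {
        return state.getBoost() * (float) (1.0 / Math.sqrt(state.getLength()));
      }

      // The remaining abstract methods use DefaultSimilarity-like formulas.
      @Override
      public float coord(int overlap, int maxOverlap) { return overlap / (float) maxOverlap; }

      @Override
      public float queryNorm(float sumOfSquaredWeights) { return (float) (1.0 / Math.sqrt(sumOfSquaredWeights)); }

      @Override
      public float tf(float freq) { return (float) Math.sqrt(freq); }

      @Override
      public float idf(long docFreq, long numDocs) { return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0); }

      @Override
      public float sloppyFreq(int distance) { return 1.0f / (distance + 1); }

      @Override
      public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1; }
    }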
Bug Fixes
* LUCENE-4890: QueryTreeBuilder.getBuilder() only finds interfaces on the

View File: DefaultSimilarity.java

@@ -19,10 +19,40 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/** Expert: Default scoring implementation. */
/**
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
* encodes} norm values as a single byte before being stored. At search time,
* the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
* instance, <i>decode(encode(0.89)) = 0.75</i>.
* <p>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for all
* documents - are maintained in memory.
* <p>
* The rationale supporting such lossy compression of norm values is that, given
* the difficulty (and inaccuracy) with which users express their true information
* need in a query, only big differences matter. <br>
* &nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of
* scoring, e.g. by using a different {@link Similarity} for search.
*/
public class DefaultSimilarity extends TFIDFSimilarity {
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Sole constructor: parameter-free */
public DefaultSimilarity() {}
@@ -38,6 +68,35 @@ public class DefaultSimilarity extends TFIDFSimilarity {
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
/**
* Encodes a normalization factor for storage in an index.
* <p>
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
* zero-exponent point at 15, thus representing values from around 7x10^9 to
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
* represented. Negative numbers are rounded up to zero. Values too large to
* represent are rounded down to the largest representable value. Positive
* values too small to represent are rounded up to the smallest positive
* representable value.
*
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
@Override
public final long encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/**
* Decodes the norm value, assuming it is a single byte.
*
* @see #encodeNormValue(float)
*/
@Override
public final float decodeNormValue(long norm) {
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
}
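To make the precision loss described in the class javadoc concrete, a small standalone snippet (class name hypothetical) can exercise the same SmallFloat encoding these two methods delegate to; it reproduces the decode(encode(0.89)) = 0.75 example quoted above:

    import org.apache.lucene.util.SmallFloat;

    // Hypothetical demo class, not part of the patch.
    public class NormPrecisionDemo {
      public static void main(String[] args) {
        byte encoded = SmallFloat.floatToByte315(0.89f);    // 3-bit mantissa, 5-bit exponent
        float decoded = SmallFloat.byte315ToFloat(encoded);  // nearest representable value
        System.out.println(decoded);                         // prints 0.75 -- the round trip is lossy
      }
    }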
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link

View File: TFIDFSimilarity.java

@@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
@@ -496,27 +495,8 @@ import org.apache.lucene.util.SmallFloat;
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
* However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
* The rationale supporting such lossy compression of norm values is that
* given the difficulty (and inaccuracy) of users to express their true information
* need by a query, only big differences matter.
* <br>&nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* Note that search time is too late to modify this <i>norm</i> part of scoring,
* e.g. by using a different {@link Similarity} for search.
* </li>
* </ol>
*
@@ -666,38 +646,15 @@ public abstract class TFIDFSimilarity extends Similarity {
return encodeNormValue(normValue);
}
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Decodes a normalization factor stored in an index.
/**
* Decodes a normalization factor stored in an index.
*
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
public abstract float decodeNormValue(long norm);
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/** Encodes a normalization factor for storage in an index. */
public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@@ -756,7 +713,7 @@ public abstract class TFIDFSimilarity extends Similarity {
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field
}
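For orientation, a worked example with hypothetical numbers of how the decoded norm scales the raw tf*weight product computed in score() above:

    // All numbers are hypothetical, chosen only to show the arithmetic.
    public class NormScalingExample {
      public static void main(String[] args) {
        float tfValue = (float) Math.sqrt(3);      // tf(freq) for freq = 3
        float weightValue = 4.0f;                  // idf^2 * queryNorm * query boost
        float raw = tfValue * weightValue;         // ~6.93, the un-normalized score
        float fieldNorm = 0.5f;                    // decodeNormValue(norms.get(doc)) for a longer field
        System.out.println(raw * fieldNorm);       // ~3.46 -- longer fields score lower
      }
    }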
@Override
@@ -843,8 +800,7 @@ public abstract class TFIDFSimilarity extends Similarity {
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
float fieldNorm =
norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f;
float fieldNorm = norms != null ? decodeNormValue(norms.get(doc)) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);

View File: TestMaxTermFrequency.java

@@ -26,8 +26,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -97,21 +98,28 @@ public class TestMaxTermFrequency extends LuceneTestCase {
/**
* Simple similarity that encodes maxTermFrequency directly as the norm value
*/
class TestSimilarity extends DefaultSimilarity {
@Override
public byte encodeNormValue(float f) {
return (byte) f;
}
@Override
public float decodeNormValue(byte b) {
return (float) b;
}
class TestSimilarity extends TFIDFSimilarity {
@Override
public float lengthNorm(FieldInvertState state) {
return state.getMaxTermFrequency();
}
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
}

View File: TestNorms.java

@@ -29,7 +29,9 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@@ -45,21 +47,29 @@ import org.apache.lucene.util._TestUtil;
public class TestNorms extends LuceneTestCase {
final String byteTestField = "normsTestByte";
class CustomNormEncodingSimilarity extends DefaultSimilarity {
class CustomNormEncodingSimilarity extends TFIDFSimilarity {
@Override
public byte encodeNormValue(float f) {
return (byte) f;
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(byte b) {
return (float) b;
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float lengthNorm(FieldInvertState state) {
return state.getLength();
}
@Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
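As a side note on how such round trips can be verified, here is a sketch (class and method names hypothetical; assumes the 4.x MultiDocValues accessor for composite readers) of reading the stored long norm back at search time:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiDocValues;
    import org.apache.lucene.index.NumericDocValues;

    // Hypothetical helper, not part of the patch.
    public class NormInspector {
      /** Returns the raw long norm stored for docID, or 0 if the field has no norms. */
      static long rawNorm(IndexReader reader, String field, int docID) throws IOException {
        NumericDocValues norms = MultiDocValues.getNormValues(reader, field);
        return norms == null ? 0 : norms.get(docID); // the value produced by encodeNormValue()
      }
    }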
// LUCENE-1260

View File: TestOmitTf.java

@@ -18,26 +18,35 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.*;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestOmitTf extends LuceneTestCase {
public static class SimpleSimilarity extends TFIDFSimilarity {
@Override public float decodeNormValue(long norm) { return norm; }
@Override public long encodeNormValue(float f) { return (long) f; }
@Override
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
@Override

View File: TestSimilarityProvider.java

@@ -107,6 +107,16 @@ public class TestSimilarityProvider extends LuceneTestCase {
private class Sim1 extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float coord(int overlap, int maxOverlap) {
return 1f;
@@ -145,6 +155,16 @@ private class Sim2 extends TFIDFSimilarity {
private class Sim2 extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float coord(int overlap, int maxOverlap) {
return 1f;

View File: NormValueSource.java

@@ -29,7 +29,7 @@ import java.io.IOException;
import java.util.Map;
/**
* Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)}
* Function that returns {@link TFIDFSimilarity#decodeNormValue(long)}
* for every document.
* <p>
* Note that the configured Similarity for the field must be

View File: DocumentBuilderTest.java

@@ -357,9 +357,7 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
*/
private static byte expectedNorm(final DefaultSimilarity sim,
final int length, final float boost) {
return sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
return (byte) sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
}
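A quick illustration of the helper's arithmetic (values hypothetical): a four-term field with boost 1.0f has a raw norm of 1.0f / sqrt(4) = 0.5f, which DefaultSimilarity then compresses to a single byte:

    import org.apache.lucene.search.similarities.DefaultSimilarity;

    // Hypothetical demo class, not part of the patch.
    public class ExpectedNormDemo {
      public static void main(String[] args) {
        DefaultSimilarity sim = new DefaultSimilarity();
        byte expected = (byte) sim.encodeNormValue(1.0f / (float) Math.sqrt(4)); // boost / sqrt(length)
        float roundTripped = sim.decodeNormValue(expected & 0xFFL);              // nearest representable value, close to 0.5f
        System.out.println(expected + " -> " + roundTripped);
      }
    }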