LUCENE-5078: Allow TFIDFSimilarity implementations to encode norm values into more than a single byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1496579 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2013-06-25 18:39:28 +00:00
parent 450534ff3a
commit 3cf2f34181
9 changed files with 146 additions and 81 deletions

View File: lucene/CHANGES.txt

@@ -109,6 +109,11 @@ Changes in backwards compatibility policy
Lucene has much better support for numeric data through (Int|Long)Field,
NumericRangeQuery and FieldCache.get(Int|Long)s. (Adrien Grand)
* LUCENE-5078: TFIDFSimilarity lets you encode the norm value as an arbitrary long.
As a result, encode/decodeNormValue were made abstract and their signatures changed.
The default implementation was moved to DefaultSimilarity, which encodes the norm as
a single-byte value. (Shai Erera)
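To illustrate what the widened, now-abstract signatures allow, here is a minimal sketch (not part of this commit; the class name and formulas are hypothetical, modeled on the test similarities updated further down in this patch) of a TFIDFSimilarity that keeps norms lossless by storing the raw float bits in the long:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.similarities.TFIDFSimilarity;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical example, not part of LUCENE-5078 itself.
    public class LosslessNormSimilarity extends TFIDFSimilarity {

      @Override
      public long encodeNormValue(float f) {
        // Keep all 32 bits of the float in the long norm, so decode(encode(x)) == x.
        return Float.floatToIntBits(f) & 0xFFFFFFFFL;
      }

      @Override
      public float decodeNormValue(long norm) {
        return Float.intBitsToFloat((int) norm);
      }

      @Override
      public float lengthNorm(FieldInvertState state) {
        return state.getBoost() * (float) (1.0 / Math.sqrt(state.getLength()));
      }

      // The remaining abstract methods use DefaultSimilarity-like formulas.
      @Override
      public float coord(int overlap, int maxOverlap) { return overlap / (float) maxOverlap; }

      @Override
      public float queryNorm(float sumOfSquaredWeights) { return (float) (1.0 / Math.sqrt(sumOfSquaredWeights)); }

      @Override
      public float tf(float freq) { return (float) Math.sqrt(freq); }

      @Override
      public float idf(long docFreq, long numDocs) { return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0); }

      @Override
      public float sloppyFreq(int distance) { return 1.0f / (distance + 1); }

      @Override
      public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1; }
    }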
Bug Fixes
* LUCENE-4890: QueryTreeBuilder.getBuilder() only finds interfaces on the

View File: DefaultSimilarity.java

@@ -19,10 +19,40 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/** Expert: Default scoring implementation. */
/**
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
* encodes} norm values as a single byte before being stored. At search time,
* the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
* instance, <i>decode(encode(0.89)) = 0.75</i>.
* <p>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for all
* documents - are maintained in memory.
* <p>
* The rationale supporting such lossy compression of norm values is that, given
* the difficulty (and inaccuracy) with which users express their true information
* need in a query, only big differences matter. <br>
* &nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of
* scoring, e.g. by using a different {@link Similarity} for search.
*/
public class DefaultSimilarity extends TFIDFSimilarity {
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Sole constructor: parameter-free */
public DefaultSimilarity() {}
@@ -38,6 +68,35 @@ public class DefaultSimilarity extends TFIDFSimilarity {
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
/**
* Encodes a normalization factor for storage in an index.
* <p>
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
* zero-exponent point at 15, thus representing values from around 7x10^9 to
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
* represented. Negative numbers are rounded up to zero. Values too large to
* represent are rounded down to the largest representable value. Positive
* values too small to represent are rounded up to the smallest positive
* representable value.
*
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
@Override
public final long encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/**
* Decodes the norm value, assuming it is a single byte.
*
* @see #encodeNormValue(float)
*/
@Override
public final float decodeNormValue(long norm) {
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
}
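To make the precision loss described in the class javadoc concrete, a small standalone snippet (class name hypothetical) can exercise the same SmallFloat encoding these two methods delegate to; it reproduces the decode(encode(0.89)) = 0.75 example quoted above:

    import org.apache.lucene.util.SmallFloat;

    // Hypothetical demo class, not part of the patch.
    public class NormPrecisionDemo {
      public static void main(String[] args) {
        byte encoded = SmallFloat.floatToByte315(0.89f);    // 3-bit mantissa, 5-bit exponent
        float decoded = SmallFloat.byte315ToFloat(encoded);  // nearest representable value
        System.out.println(decoded);                         // prints 0.75 -- the round trip is lossy
      }
    }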
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link

View File: TFIDFSimilarity.java

@@ -28,7 +28,6 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
@@ -496,27 +495,8 @@ import org.apache.lucene.util.SmallFloat;
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
* However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
* The rationale supporting such lossy compression of norm values is that
* given the difficulty (and inaccuracy) of users to express their true information
* need by a query, only big differences matter.
* <br>&nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* Note that search time is too late to modify this <i>norm</i> part of scoring,
* e.g. by using a different {@link Similarity} for search.
* </li>
* </ol>
*
@@ -666,38 +646,15 @@ public abstract class TFIDFSimilarity extends Similarity {
return encodeNormValue(normValue);
}
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Decodes a normalization factor stored in an index.
/**
* Decodes a normalization factor stored in an index.
*
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
public abstract float decodeNormValue(long norm);
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/** Encodes a normalization factor for storage in an index. */
public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@@ -756,7 +713,7 @@ public abstract class TFIDFSimilarity extends Similarity {
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field
}
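For orientation, a worked example with hypothetical numbers of how the decoded norm scales the raw tf*weight product computed in score() above:

    // All numbers are hypothetical, chosen only to show the arithmetic.
    public class NormScalingExample {
      public static void main(String[] args) {
        float tfValue = (float) Math.sqrt(3);      // tf(freq) for freq = 3
        float weightValue = 4.0f;                  // idf^2 * queryNorm * query boost
        float raw = tfValue * weightValue;         // ~6.93, the un-normalized score
        float fieldNorm = 0.5f;                    // decodeNormValue(norms.get(doc)) for a longer field
        System.out.println(raw * fieldNorm);       // ~3.46 -- longer fields score lower
      }
    }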
@Override
@@ -843,8 +800,7 @@ public abstract class TFIDFSimilarity extends Similarity {
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
float fieldNorm =
norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f;
float fieldNorm = norms != null ? decodeNormValue(norms.get(doc)) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);

View File: TestMaxTermFrequency.java

@@ -26,8 +26,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -97,21 +98,28 @@ public class TestMaxTermFrequency extends LuceneTestCase {
/**
* Simple similarity that encodes maxTermFrequency directly as the norm value
*/
class TestSimilarity extends DefaultSimilarity {
@Override
public byte encodeNormValue(float f) {
return (byte) f;
}
@Override
public float decodeNormValue(byte b) {
return (float) b;
}
class TestSimilarity extends TFIDFSimilarity {
@Override
public float lengthNorm(FieldInvertState state) {
return state.getMaxTermFrequency();
}
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
}

View File: TestNorms.java

@@ -29,7 +29,9 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@@ -45,21 +47,29 @@ import org.apache.lucene.util._TestUtil;
public class TestNorms extends LuceneTestCase {
final String byteTestField = "normsTestByte";
class CustomNormEncodingSimilarity extends DefaultSimilarity {
class CustomNormEncodingSimilarity extends TFIDFSimilarity {
@Override
public byte encodeNormValue(float f) {
return (byte) f;
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(byte b) {
return (float) b;
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float lengthNorm(FieldInvertState state) {
return state.getLength();
}
@Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
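As a side note on how such round trips can be verified, here is a sketch (class and method names hypothetical; assumes the 4.x MultiDocValues accessor for composite readers) of reading the stored long norm back at search time:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiDocValues;
    import org.apache.lucene.index.NumericDocValues;

    // Hypothetical helper, not part of the patch.
    public class NormInspector {
      /** Returns the raw long norm stored for docID, or 0 if the field has no norms. */
      static long rawNorm(IndexReader reader, String field, int docID) throws IOException {
        NumericDocValues norms = MultiDocValues.getNormValues(reader, field);
        return norms == null ? 0 : norms.get(docID); // the value produced by encodeNormValue()
      }
    }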
// LUCENE-1260

View File: TestOmitTf.java

@@ -18,26 +18,35 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.*;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestOmitTf extends LuceneTestCase {
public static class SimpleSimilarity extends TFIDFSimilarity {
@Override public float decodeNormValue(long norm) { return norm; }
@Override public long encodeNormValue(float f) { return (long) f; }
@Override
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
@Override

View File: TestSimilarityProvider.java

@@ -107,6 +107,16 @@ public class TestSimilarityProvider extends LuceneTestCase {
private class Sim1 extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float coord(int overlap, int maxOverlap) {
return 1f;
@@ -145,6 +155,16 @@ private class Sim2 extends TFIDFSimilarity {
private class Sim2 extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float coord(int overlap, int maxOverlap) {
return 1f;

View File: NormValueSource.java

@@ -29,7 +29,7 @@ import java.io.IOException;
import java.util.Map;
/**
* Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)}
* Function that returns {@link TFIDFSimilarity#decodeNormValue(long)}
* for every document.
* <p>
* Note that the configured Similarity for the field must be

View File: DocumentBuilderTest.java

@@ -357,9 +357,7 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
*/
private static byte expectedNorm(final DefaultSimilarity sim,
final int length, final float boost) {
return sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
return (byte) sim.encodeNormValue(boost / ((float) Math.sqrt(length)));
}
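A quick illustration of the helper's arithmetic (values hypothetical): a four-term field with boost 1.0f has a raw norm of 1.0f / sqrt(4) = 0.5f, which DefaultSimilarity then compresses to a single byte:

    import org.apache.lucene.search.similarities.DefaultSimilarity;

    // Hypothetical demo class, not part of the patch.
    public class ExpectedNormDemo {
      public static void main(String[] args) {
        DefaultSimilarity sim = new DefaultSimilarity();
        byte expected = (byte) sim.encodeNormValue(1.0f / (float) Math.sqrt(4)); // boost / sqrt(length)
        float roundTripped = sim.decodeNormValue(expected & 0xFFL);              // nearest representable value, close to 0.5f
        System.out.println(expected + " -> " + roundTripped);
      }
    }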