From f335512f02ed59ab68115ef837fa133cb40ac2eb Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Fri, 31 Jan 2014 12:16:31 +0000
Subject: [PATCH] LUCENE-5398: remove invalid byte cast in NormValueSource,
 since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   4 +
 .../org/apache/lucene/document/Document.java    |  14 +-
 .../function/valuesource/NormValueSource.java   |   6 +-
 .../function/TestLongNormValueSource.java       | 237 ++++++++++++++++++
 4 files changed, 254 insertions(+), 7 deletions(-)
 create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 14c5abbdb2e..58629fb2427 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -217,6 +217,10 @@ Bug fixes
   return any groups when the joined query required more than one
   rewrite step (Peng Cheng via Mike McCandless)
 
+* LUCENE-5398: NormValueSource was incorrectly casting the long value
+  to byte, before calling Similarity.decodeNormValue. (Peng Cheng via
+  Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
diff --git a/lucene/core/src/java/org/apache/lucene/document/Document.java b/lucene/core/src/java/org/apache/lucene/document/Document.java
index b7e86e10bcc..1ce27aca6a0 100644
--- a/lucene/core/src/java/org/apache/lucene/document/Document.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Document.java
@@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
       Field newField = new Field(field.name(), (FieldType) field.fieldType());
 
       newField.fieldsData = field.stringValue();
-      if (newField.fieldsData == null) 
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.numericValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.binaryValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.readerValue();
+      }
 
       add(newField);
     }
-  }  
+  }
 
 
   /**
@@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
     for (int i = 0; i < fields.size(); i++) {
       IndexableField field = fields.get(i);
       buffer.append(field.toString());
-      if (i != fields.size()-1)
+      if (i != fields.size()-1) {
         buffer.append(" ");
+      }
     }
     buffer.append(">");
     return buffer.toString();
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
index c6b86aedce1..81e4067bec7 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
     return new FloatDocValues(this) {
       @Override
       public float floatVal(int doc) {
-        return similarity.decodeNormValue((byte)norms.get(doc));
+        return similarity.decodeNormValue(norms.get(doc));
       }
     };
   }
 
   @Override
   public boolean equals(Object o) {
-    if (this.getClass() != o.getClass()) return false;
+    if (this.getClass() != o.getClass()) {
+      return false;
+    }
     return this.field.equals(((NormValueSource)o).field);
   }
 
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
new file mode 100644
index 00000000000..c4be0247e2e
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
@@ -0,0 +1,237 @@
+package org.apache.lucene.queries.function;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.queries.function.valuesource.NormValueSource;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestLongNormValueSource extends LuceneTestCase {
+
+  static Directory dir;
+  static IndexReader reader;
+  static IndexSearcher searcher;
+  private static Similarity sim = new PreciseDefaultSimilarity();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    dir = newDirectory();
+    IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwConfig.setMergePolicy(newLogMergePolicy());
+    iwConfig.setSimilarity(sim);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
+
+    Document doc = new Document();
+    doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new TextField("text", "second test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    reader = iw.getReader();
+    searcher = newSearcher(reader);
+    iw.close();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    searcher = null;
+    reader.close();
+    reader = null;
+    dir.close();
+    dir = null;
+  }
+
+  public void testNorm() throws Exception {
+    Similarity saved = searcher.getSimilarity();
+    try {
+      // no norm field (so agnostic to indexed similarity)
+      searcher.setSimilarity(sim);
+      assertHits(new FunctionQuery(
+          new NormValueSource("text")),
+          new float[] { 0f, 0f });
+    } finally {
+      searcher.setSimilarity(saved);
+    }
+  }
+
+  void assertHits(Query q, float scores[]) throws Exception {
+    ScoreDoc expected[] = new ScoreDoc[scores.length];
+    int expectedDocs[] = new int[scores.length];
+    for (int i = 0; i < expected.length; i++) {
+      expectedDocs[i] = i;
+      expected[i] = new ScoreDoc(i, scores[i]);
+    }
+    TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
+
+    /*
+    for (int i = 0; i < docs.scoreDocs.length; i++) {
+      System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
+    }
+    */
+    CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
+    CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
+    CheckHits.checkExplanations(q, "", searcher);
+  }
+}
+
+/** Encodes norms as full-precision floats (not a single byte), so exact values survive indexing. */
+class PreciseDefaultSimilarity extends TFIDFSimilarity {
+
+  /** Sole constructor: parameter-free. */
+  public PreciseDefaultSimilarity() {}
+
+  /** Implemented as <code>overlap / maxOverlap</code>. */
+  @Override
+  public float coord(int overlap, int maxOverlap) {
+    return overlap / (float)maxOverlap;
+  }
+
+  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
+  @Override
+  public float queryNorm(float sumOfSquaredWeights) {
+    return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
+  }
+
+  /**
+   * Encodes a normalization factor for storage in an index.
+   * <p>
+   * Unlike <code>DefaultSimilarity</code>, which packs the norm into a single
+   * byte with a lossy <code>SmallFloat</code> encoding, this implementation
+   * stores the raw float bits in the long norm, so no precision is lost and
+   * the resulting value does not fit in one byte.
+   *
+   * @see org.apache.lucene.document.Field#setBoost(float)
+   * @see org.apache.lucene.util.SmallFloat
+   */
+  @Override
+  public final long encodeNormValue(float f) {
+    return Float.floatToIntBits(f);
+  }
+
+  /**
+   * Decodes the norm value by reinterpreting the low 32 bits of the long as
+   * the raw bits of a float.
+   *
+   * @see #encodeNormValue(float)
+   */
+  @Override
+  public final float decodeNormValue(long norm) {
+    return Float.intBitsToFloat((int)norm);
+  }
+
+  /** Implemented as
+   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
+   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
+   *  #setDiscountOverlaps} is false, else it's {@link
+   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
+   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
+   *
+   *  @lucene.experimental */
+  @Override
+  public float lengthNorm(FieldInvertState state) {
+    final int numTerms;
+    if (discountOverlaps) {
+      numTerms = state.getLength() - state.getNumOverlap();
+    } else {
+      numTerms = state.getLength();
+    }
+    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
+  }
+
+  /** Implemented as <code>sqrt(freq)</code>. */
+  @Override
+  public float tf(float freq) {
+    return (float)Math.sqrt(freq);
+  }
+
+  /** Implemented as <code>1 / (distance + 1)</code>. */
+  @Override
+  public float sloppyFreq(int distance) {
+    return 1.0f / (distance + 1);
+  }
+
+  /** The default implementation returns <code>1</code>. */
+  @Override
+  public float scorePayload(int doc, int start, int end, BytesRef payload) {
+    return 1;
+  }
+
+  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  @Override
+  public float idf(long docFreq, long numDocs) {
+    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  }
+
+  /**
+   * True if overlap tokens (tokens with a position increment of zero) are
+   * discounted from the document's length.
+   */
+  protected boolean discountOverlaps = true;
+
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is true, meaning overlap
+   *  tokens do not count when computing norms.
+   *
+   *  @lucene.experimental
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /**
+   * Returns true if overlap tokens are discounted from the document's length.
+   * @see #setDiscountOverlaps
+   */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
+  @Override
+  public String toString() {
+    return "DefaultSimilarity";
+  }
+}
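Why the removed cast mattered, shown outside the patch in a short standalone sketch (the class and helper names below are invented for illustration, and the encode/decode pair simply mirrors the float-bits scheme of PreciseDefaultSimilarity above): DefaultSimilarity really does pack each norm into a single byte, so truncating the stored long to a byte happened to be harmless for it, but any TFIDFSimilarity that uses more of the 64-bit norm had its values destroyed by the old NormValueSource code.

public class NormCastDemo {
  // Norm carries the raw float bits, as in PreciseDefaultSimilarity (hypothetical helper names).
  static long encode(float f) { return Float.floatToIntBits(f); }
  static float decode(long norm) { return Float.intBitsToFloat((int) norm); }

  public static void main(String[] args) {
    float lengthNorm = (float) (1.0 / Math.sqrt(6)); // a six-term field, roughly 0.408
    long norm = encode(lengthNorm);                  // about 1.05e9: does not fit in one byte

    float buggy = decode((byte) norm); // old behavior: only the low 8 bits survive, yielding a garbage float
    float fixed = decode(norm);        // patched behavior: the full long reaches decodeNormValue, recovering ~0.408

    System.out.println("with cast: " + buggy + ", without cast: " + fixed);
  }
}

Since TFIDFSimilarity.decodeNormValue takes a long, a Similarity is free to spend more than one byte per norm, which is exactly what the old cast silently broke.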