LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
2014-01-31 12:16:31 +00:00 · 2014-01-31 12:16:31 +00:00 · f335512f02
parent fa2423a2ec
commit f335512f02
4 changed files with 254 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -217,6 +217,10 @@ Bug fixes
  return any groups when the joined query required more than one
  rewrite step (Peng Cheng via Mike McCandless)

+* LUCENE-5398: NormValueSource was incorrectly casting the long value
+  to byte, before calling Similarity.decodeNormValue.  (Peng Cheng via
+  Mike McCandless)
+
 API Changes

 * LUCENE-5339: The facet module was simplified/reworked to make the
--- a/lucene/core/src/java/org/apache/lucene/document/Document.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Document.java
@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
      Field newField = new Field(field.name(), (FieldType) field.fieldType());
     
      newField.fieldsData = field.stringValue();
-      if (newField.fieldsData == null) 
+      if (newField.fieldsData == null) {
        newField.fieldsData = field.numericValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
        newField.fieldsData = field.binaryValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
        newField.fieldsData = field.readerValue();
+      }
     
      add(newField);
    }
- }
+  }

  
  /**
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
    for (int i = 0; i < fields.size(); i++) {
      IndexableField field = fields.get(i);
      buffer.append(field.toString());
-      if (i != fields.size()-1)
+      if (i != fields.size()-1) {
        buffer.append(" ");
+      }
    }
    buffer.append(">");
    return buffer.toString();
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
    return new FloatDocValues(this) {
      @Override
      public float floatVal(int doc) {
-        return similarity.decodeNormValue((byte)norms.get(doc));
+        return similarity.decodeNormValue(norms.get(doc));
      }
    };
  }

  @Override
  public boolean equals(Object o) {
-    if (this.getClass() != o.getClass()) return false;
+    if (this.getClass() != o.getClass()) {
+      return false;
+    }
    return this.field.equals(((NormValueSource)o).field);
  }

--- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
@ -0,0 +1,237 @@
+package org.apache.lucene.queries.function;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.queries.function.valuesource.NormValueSource;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestLongNormValueSource extends LuceneTestCase {
+
+  static Directory dir;
+  static IndexReader reader;
+  static IndexSearcher searcher;
+  private static Similarity sim = new PreciseDefaultSimilarity();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    dir = newDirectory();
+    IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwConfig.setMergePolicy(newLogMergePolicy());
+    iwConfig.setSimilarity(sim);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
+
+    Document doc = new Document();
+    doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new TextField("text", "second test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    reader = iw.getReader();
+    searcher = newSearcher(reader);
+    iw.close();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    searcher = null;
+    reader.close();
+    reader = null;
+    dir.close();
+    dir = null;
+  }
+
+  public void testNorm() throws Exception {
+    Similarity saved = searcher.getSimilarity();
+    try {
+      // no norm field (so agnostic to indexed similarity)
+      searcher.setSimilarity(sim);
+      assertHits(new FunctionQuery(
+          new NormValueSource("text")),
+          new float[] { 0f, 0f });
+    } finally {
+      searcher.setSimilarity(saved);
+    }
+  }
+
+  void assertHits(Query q, float scores[]) throws Exception {
+    ScoreDoc expected[] = new ScoreDoc[scores.length];
+    int expectedDocs[] = new int[scores.length];
+    for (int i = 0; i < expected.length; i++) {
+      expectedDocs[i] = i;
+      expected[i] = new ScoreDoc(i, scores[i]);
+    }
+    TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
+
+    /*
+    for (int i=0;i<docs.scoreDocs.length;i++) {
+      System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
+    }
+    */
+
+    CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
+    CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
+    CheckHits.checkExplanations(q, "", searcher);
+  }
+}
+
+
+/** Encodes norm as 4-byte float. */
+class PreciseDefaultSimilarity extends TFIDFSimilarity {
+
+  /** Sole constructor: parameter-free */
+  public PreciseDefaultSimilarity() {}
+
+  /** Implemented as <code>overlap / maxOverlap</code>. */
+  @Override
+  public float coord(int overlap, int maxOverlap) {
+    return overlap / (float)maxOverlap;
+  }
+
+  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
+  @Override
+  public float queryNorm(float sumOfSquaredWeights) {
+    return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
+  }
+
+  /**
+   * Encodes a normalization factor for storage in an index.
+   * <p>
+   * The encoding uses a three-bit mantissa, a five-bit exponent, and the
+   * zero-exponent point at 15, thus representing values from around 7x10^9 to
+   * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
+   * represented. Negative numbers are rounded up to zero. Values too large to
+   * represent are rounded down to the largest representable value. Positive
+   * values too small to represent are rounded up to the smallest positive
+   * representable value.
+   *
+   * @see org.apache.lucene.document.Field#setBoost(float)
+   * @see org.apache.lucene.util.SmallFloat
+   */
+  @Override
+  public final long encodeNormValue(float f) {
+    return Float.floatToIntBits(f);
+  }
+
+  /**
+   * Decodes the norm value, assuming it is a single byte.
+   *
+   * @see #encodeNormValue(float)
+   */
+  @Override
+  public final float decodeNormValue(long norm) {
+    return Float.intBitsToFloat((int)norm);
+  }
+
+  /** Implemented as
+   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
+   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
+   *  #setDiscountOverlaps} is false, else it's {@link
+   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
+   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
+   *
+   *  @lucene.experimental */
+  @Override
+  public float lengthNorm(FieldInvertState state) {
+    final int numTerms;
+    if (discountOverlaps) {
+      numTerms = state.getLength() - state.getNumOverlap();
+    } else {
+      numTerms = state.getLength();
+    }
+    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
+  }
+
+  /** Implemented as <code>sqrt(freq)</code>. */
+  @Override
+  public float tf(float freq) {
+    return (float)Math.sqrt(freq);
+  }
+
+  /** Implemented as <code>1 / (distance + 1)</code>. */
+  @Override
+  public float sloppyFreq(int distance) {
+    return 1.0f / (distance + 1);
+  }
+
+  /** The default implementation returns <code>1</code> */
+  @Override
+  public float scorePayload(int doc, int start, int end, BytesRef payload) {
+    return 1;
+  }
+
+  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  @Override
+  public float idf(long docFreq, long numDocs) {
+    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  }
+
+  /**
+   * True if overlap tokens (tokens with a position of increment of zero) are
+   * discounted from the document's length.
+   */
+  protected boolean discountOverlaps = true;
+
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is true, meaning overlap
+   *  tokens do not count when computing norms.
+   *
+   *  @lucene.experimental
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /**
+   * Returns true if overlap tokens are discounted from the document's length.
+   * @see #setDiscountOverlaps
+   */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
+  @Override
+  public String toString() {
+    return "DefaultSimilarity";
+  }
+}