LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
2014-01-31 12:16:31 +00:00 · 2014-01-31 12:16:31 +00:00 · f335512f02
parent fa2423a2ec
commit f335512f02
4 changed files with 254 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -217,6 +217,10 @@ Bug fixes
  return any groups when the joined query required more than one
  rewrite step (Peng Cheng via Mike McCandless)
 * LUCENE-5398: NormValueSource was incorrectly casting the long value
  to byte, before calling Similarity.decodeNormValue.  (Peng Cheng via
  Mike McCandless)
 API Changes
 * LUCENE-5339: The facet module was simplified/reworked to make the
--- a/lucene/core/src/java/org/apache/lucene/document/Document.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Document.java
@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
      Field newField = new Field(field.name(), (FieldType) field.fieldType());
      newField.fieldsData = field.stringValue();
-      if (newField.fieldsData == null) 
+      if (newField.fieldsData == null) {
        newField.fieldsData = field.numericValue();
-      if (newField.fieldsData == null) 
+      }
      if (newField.fieldsData == null) {
        newField.fieldsData = field.binaryValue();
-      if (newField.fieldsData == null) 
+      }
      if (newField.fieldsData == null) {
        newField.fieldsData = field.readerValue();
      }
      add(newField);
    }
- }
+  }
  /**
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
    for (int i = 0; i < fields.size(); i++) {
      IndexableField field = fields.get(i);
      buffer.append(field.toString());
-      if (i != fields.size()-1)
+      if (i != fields.size()-1) {
        buffer.append(" ");
      }
    }
    buffer.append(">");
    return buffer.toString();
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
    return new FloatDocValues(this) {
      @Override
      public float floatVal(int doc) {
-        return similarity.decodeNormValue((byte)norms.get(doc));
+        return similarity.decodeNormValue(norms.get(doc));
      }
    };
  }
  @Override
  public boolean equals(Object o) {
-    if (this.getClass() != o.getClass()) return false;
+    if (this.getClass() != o.getClass()) {
      return false;
    }
    return this.field.equals(((NormValueSource)o).field);
  }
--- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
@ -0,0 +1,237 @@
 package org.apache.lucene.queries.function;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.queries.function.valuesource.NormValueSource;
 import org.apache.lucene.search.CheckHits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 public class TestLongNormValueSource extends LuceneTestCase {
  static Directory dir;
  static IndexReader reader;
  static IndexSearcher searcher;
  private static Similarity sim = new PreciseDefaultSimilarity();
  @BeforeClass
  public static void beforeClass() throws Exception {
    dir = newDirectory();
    IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwConfig.setMergePolicy(newLogMergePolicy());
    iwConfig.setSimilarity(sim);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
    Document doc = new Document();
    doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
    iw.addDocument(doc);
    doc = new Document();
    doc.add(new TextField("text", "second test", Field.Store.NO));
    iw.addDocument(doc);
    reader = iw.getReader();
    searcher = newSearcher(reader);
    iw.close();
  }
  @AfterClass
  public static void afterClass() throws Exception {
    searcher = null;
    reader.close();
    reader = null;
    dir.close();
    dir = null;
  }
  public void testNorm() throws Exception {
    Similarity saved = searcher.getSimilarity();
    try {
      // no norm field (so agnostic to indexed similarity)
      searcher.setSimilarity(sim);
      assertHits(new FunctionQuery(
          new NormValueSource("text")),
          new float[] { 0f, 0f });
    } finally {
      searcher.setSimilarity(saved);
    }
  }
  void assertHits(Query q, float scores[]) throws Exception {
    ScoreDoc expected[] = new ScoreDoc[scores.length];
    int expectedDocs[] = new int[scores.length];
    for (int i = 0; i < expected.length; i++) {
      expectedDocs[i] = i;
      expected[i] = new ScoreDoc(i, scores[i]);
    }
    TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
    /*
    for (int i=0;i<docs.scoreDocs.length;i++) {
      System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
    }
    */
    CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
    CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
    CheckHits.checkExplanations(q, "", searcher);
  }
 }
 /** Encodes norm as 4-byte float. */
 class PreciseDefaultSimilarity extends TFIDFSimilarity {
  /** Sole constructor: parameter-free */
  public PreciseDefaultSimilarity() {}
  /** Implemented as <code>overlap / maxOverlap</code>. */
  @Override
  public float coord(int overlap, int maxOverlap) {
    return overlap / (float)maxOverlap;
  }
  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
  @Override
  public float queryNorm(float sumOfSquaredWeights) {
    return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
  }
  /**
   * Encodes a normalization factor for storage in an index.
   * <p>
   * The encoding uses a three-bit mantissa, a five-bit exponent, and the
   * zero-exponent point at 15, thus representing values from around 7x10^9 to
   * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
   * represented. Negative numbers are rounded up to zero. Values too large to
   * represent are rounded down to the largest representable value. Positive
   * values too small to represent are rounded up to the smallest positive
   * representable value.
   *
   * @see org.apache.lucene.document.Field#setBoost(float)
   * @see org.apache.lucene.util.SmallFloat
   */
  @Override
  public final long encodeNormValue(float f) {
    return Float.floatToIntBits(f);
  }
  /**
   * Decodes the norm value, assuming it is a single byte.
   *
   * @see #encodeNormValue(float)
   */
  @Override
  public final float decodeNormValue(long norm) {
    return Float.intBitsToFloat((int)norm);
  }
  /** Implemented as
   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
   *  #setDiscountOverlaps} is false, else it's {@link
   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
   *
   *  @lucene.experimental */
  @Override
  public float lengthNorm(FieldInvertState state) {
    final int numTerms;
    if (discountOverlaps) {
      numTerms = state.getLength() - state.getNumOverlap();
    } else {
      numTerms = state.getLength();
    }
    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
  }
  /** Implemented as <code>sqrt(freq)</code>. */
  @Override
  public float tf(float freq) {
    return (float)Math.sqrt(freq);
  }
  /** Implemented as <code>1 / (distance + 1)</code>. */
  @Override
  public float sloppyFreq(int distance) {
    return 1.0f / (distance + 1);
  }
  /** The default implementation returns <code>1</code> */
  @Override
  public float scorePayload(int doc, int start, int end, BytesRef payload) {
    return 1;
  }
  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
  @Override
  public float idf(long docFreq, long numDocs) {
    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
  }
  /**
   * True if overlap tokens (tokens with a position of increment of zero) are
   * discounted from the document's length.
   */
  protected boolean discountOverlaps = true;
  /** Determines whether overlap tokens (Tokens with
   *  0 position increment) are ignored when computing
   *  norm.  By default this is true, meaning overlap
   *  tokens do not count when computing norms.
   *
   *  @lucene.experimental
   *
   *  @see #computeNorm
   */
  public void setDiscountOverlaps(boolean v) {
    discountOverlaps = v;
  }
  /**
   * Returns true if overlap tokens are discounted from the document's length.
   * @see #setDiscountOverlaps
   */
  public boolean getDiscountOverlaps() {
    return discountOverlaps;
  }
  @Override
  public String toString() {
    return "DefaultSimilarity";
  }
 }