From f335512f02ed59ab68115ef837fa133cb40ac2eb Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Fri, 31 Jan 2014 12:16:31 +0000
Subject: [PATCH] LUCENE-5398: remove invalid byte cast in NormValueSource,
 since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   4 +
 .../org/apache/lucene/document/Document.java    |  14 +-
 .../function/valuesource/NormValueSource.java   |   6 +-
 .../function/TestLongNormValueSource.java       | 237 ++++++++++++++++++
 4 files changed, 254 insertions(+), 7 deletions(-)
 create mode 100644 lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 14c5abbdb2e..58629fb2427 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -217,6 +217,10 @@ Bug fixes
   return any groups when the joined query required more than one
   rewrite step (Peng Cheng via Mike McCandless)
 
+* LUCENE-5398: NormValueSource was incorrectly casting the long value
+  to byte, before calling Similarity.decodeNormValue. (Peng Cheng via
+  Mike McCandless)
+
 API Changes
 
 * LUCENE-5339: The facet module was simplified/reworked to make the
diff --git a/lucene/core/src/java/org/apache/lucene/document/Document.java b/lucene/core/src/java/org/apache/lucene/document/Document.java
index b7e86e10bcc..1ce27aca6a0 100644
--- a/lucene/core/src/java/org/apache/lucene/document/Document.java
+++ b/lucene/core/src/java/org/apache/lucene/document/Document.java
@@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
       Field newField = new Field(field.name(), (FieldType) field.fieldType());
 
       newField.fieldsData = field.stringValue();
-      if (newField.fieldsData == null) 
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.numericValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.binaryValue();
-      if (newField.fieldsData == null) 
+      }
+      if (newField.fieldsData == null) {
         newField.fieldsData = field.readerValue();
+      }
 
       add(newField);
     }
-  }  
+  }
 
 
   /**
@@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
     for (int i = 0; i < fields.size(); i++) {
       IndexableField field = fields.get(i);
       buffer.append(field.toString());
-      if (i != fields.size()-1)
+      if (i != fields.size()-1) {
         buffer.append(" ");
+      }
     }
     buffer.append(">");
     return buffer.toString();
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
index c6b86aedce1..81e4067bec7 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
     return new FloatDocValues(this) {
       @Override
       public float floatVal(int doc) {
-        return similarity.decodeNormValue((byte)norms.get(doc));
+        return similarity.decodeNormValue(norms.get(doc));
       }
     };
   }
 
   @Override
   public boolean equals(Object o) {
-    if (this.getClass() != o.getClass()) return false;
+    if (this.getClass() != o.getClass()) {
+      return false;
+    }
     return this.field.equals(((NormValueSource)o).field);
   }
 
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
new file mode 100644
index 00000000000..c4be0247e2e
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
@@ -0,0 +1,237 @@
+package org.apache.lucene.queries.function;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.queries.function.valuesource.NormValueSource;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestLongNormValueSource extends LuceneTestCase {
+
+  static Directory dir;
+  static IndexReader reader;
+  static IndexSearcher searcher;
+  private static Similarity sim = new PreciseDefaultSimilarity();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    dir = newDirectory();
+    IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwConfig.setMergePolicy(newLogMergePolicy());
+    iwConfig.setSimilarity(sim);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
+
+    Document doc = new Document();
+    doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new TextField("text", "second test", Field.Store.NO));
+    iw.addDocument(doc);
+
+    reader = iw.getReader();
+    searcher = newSearcher(reader);
+    iw.close();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    searcher = null;
+    reader.close();
+    reader = null;
+    dir.close();
+    dir = null;
+  }
+
+  public void testNorm() throws Exception {
+    Similarity saved = searcher.getSimilarity();
+    try {
+      // no norm field (so agnostic to indexed similarity)
+      searcher.setSimilarity(sim);
+      assertHits(new FunctionQuery(
+          new NormValueSource("text")),
+          new float[] { 0f, 0f });
+    } finally {
+      searcher.setSimilarity(saved);
+    }
+  }
+
+  void assertHits(Query q, float scores[]) throws Exception {
+    ScoreDoc expected[] = new ScoreDoc[scores.length];
+    int expectedDocs[] = new int[scores.length];
+    for (int i = 0; i < expected.length; i++) {
+      expectedDocs[i] = i;
+      expected[i] = new ScoreDoc(i, scores[i]);
+    }
+    TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
+
+    /*
+    for (int i = 0; i < docs.scoreDocs.length; i++) {
+      System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
+    }
+    */
+    CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
+    CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
+    CheckHits.checkExplanations(q, "", searcher);
+  }
+}
+
+/** Encodes norms as full-precision floats (not a single byte), so exact values survive indexing. */
+class PreciseDefaultSimilarity extends TFIDFSimilarity {
+
+  /** Sole constructor: parameter-free. */
+  public PreciseDefaultSimilarity() {}
+
+  /** Implemented as <code>overlap / maxOverlap</code>. */
+  @Override
+  public float coord(int overlap, int maxOverlap) {
+    return overlap / (float)maxOverlap;
+  }
+
+  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
+  @Override
+  public float queryNorm(float sumOfSquaredWeights) {
+    return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
+  }
+
+  /**
+   * Encodes a normalization factor for storage in an index.
+   * <p>
+   * Unlike <code>DefaultSimilarity</code>, which packs the norm into a single
+   * byte with a lossy <code>SmallFloat</code> encoding, this implementation
+   * stores the raw float bits in the long norm, so no precision is lost and
+   * the resulting value does not fit in one byte.
+   *
+   * @see org.apache.lucene.document.Field#setBoost(float)
+   * @see org.apache.lucene.util.SmallFloat
+   */
+  @Override
+  public final long encodeNormValue(float f) {
+    return Float.floatToIntBits(f);
+  }
+
+  /**
+   * Decodes the norm value by reinterpreting the low 32 bits of the long as
+   * the raw bits of a float.
+   *
+   * @see #encodeNormValue(float)
+   */
+  @Override
+  public final float decodeNormValue(long norm) {
+    return Float.intBitsToFloat((int)norm);
+  }
+
+  /** Implemented as
+   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
+   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
+   *  #setDiscountOverlaps} is false, else it's {@link
+   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
+   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
+   *
+   *  @lucene.experimental */
+  @Override
+  public float lengthNorm(FieldInvertState state) {
+    final int numTerms;
+    if (discountOverlaps) {
+      numTerms = state.getLength() - state.getNumOverlap();
+    } else {
+      numTerms = state.getLength();
+    }
+    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
+  }
+
+  /** Implemented as <code>sqrt(freq)</code>. */
+  @Override
+  public float tf(float freq) {
+    return (float)Math.sqrt(freq);
+  }
+
+  /** Implemented as <code>1 / (distance + 1)</code>. */
+  @Override
+  public float sloppyFreq(int distance) {
+    return 1.0f / (distance + 1);
+  }
+
+  /** The default implementation returns <code>1</code>. */
+  @Override
+  public float scorePayload(int doc, int start, int end, BytesRef payload) {
+    return 1;
+  }
+
+  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  @Override
+  public float idf(long docFreq, long numDocs) {
+    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  }
+
+  /**
+   * True if overlap tokens (tokens with a position increment of zero) are
+   * discounted from the document's length.
+   */
+  protected boolean discountOverlaps = true;
+
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is true, meaning overlap
+   *  tokens do not count when computing norms.
+   *
+   *  @lucene.experimental
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /**
+   * Returns true if overlap tokens are discounted from the document's length.
+   * @see #setDiscountOverlaps
+   */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
+  @Override
+  public String toString() {
+    return "DefaultSimilarity";
+  }
+}
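Why the removed cast mattered, shown outside the patch in a short standalone sketch (the class and helper names below are invented for illustration, and the encode/decode pair simply mirrors the float-bits scheme of PreciseDefaultSimilarity above): DefaultSimilarity really does pack each norm into a single byte, so truncating the stored long to a byte happened to be harmless for it, but any TFIDFSimilarity that uses more of the 64-bit norm had its values destroyed by the old NormValueSource code.

public class NormCastDemo {
  // Norm carries the raw float bits, as in PreciseDefaultSimilarity (hypothetical helper names).
  static long encode(float f) { return Float.floatToIntBits(f); }
  static float decode(long norm) { return Float.intBitsToFloat((int) norm); }

  public static void main(String[] args) {
    float lengthNorm = (float) (1.0 / Math.sqrt(6)); // a six-term field, roughly 0.408
    long norm = encode(lengthNorm);                  // about 1.05e9: does not fit in one byte

    float buggy = decode((byte) norm); // old behavior: only the low 8 bits survive, yielding a garbage float
    float fixed = decode(norm);        // patched behavior: the full long reaches decodeNormValue, recovering ~0.408

    System.out.println("with cast: " + buggy + ", without cast: " + fixed);
  }
}

Since TFIDFSimilarity.decodeNormValue takes a long, a Similarity is free to spend more than one byte per norm, which is exactly what the old cast silently broke.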