LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-01-31 12:16:31 +00:00
parent fa2423a2ec
commit f335512f02
4 changed files with 254 additions and 7 deletions

View File

@ -217,6 +217,10 @@ Bug fixes
return any groups when the joined query required more than one
rewrite step (Peng Cheng via Mike McCandless)
* LUCENE-5398: NormValueSource was incorrectly casting the long value
to byte, before calling Similarity.decodeNormValue. (Peng Cheng via
Mike McCandless)
API Changes
* LUCENE-5339: The facet module was simplified/reworked to make the

View File

@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
Field newField = new Field(field.name(), (FieldType) field.fieldType());
newField.fieldsData = field.stringValue();
if (newField.fieldsData == null)
if (newField.fieldsData == null) {
newField.fieldsData = field.numericValue();
if (newField.fieldsData == null)
}
if (newField.fieldsData == null) {
newField.fieldsData = field.binaryValue();
if (newField.fieldsData == null)
}
if (newField.fieldsData == null) {
newField.fieldsData = field.readerValue();
}
add(newField);
}
}
}
/**
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
for (int i = 0; i < fields.size(); i++) {
IndexableField field = fields.get(i);
buffer.append(field.toString());
if (i != fields.size()-1)
if (i != fields.size()-1) {
buffer.append(" ");
}
}
buffer.append(">");
return buffer.toString();

View File

@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
return new FloatDocValues(this) {
@Override
public float floatVal(int doc) {
return similarity.decodeNormValue((byte)norms.get(doc));
return similarity.decodeNormValue(norms.get(doc));
}
};
}
@Override
public boolean equals(Object o) {
if (this.getClass() != o.getClass()) return false;
if (this.getClass() != o.getClass()) {
return false;
}
return this.field.equals(((NormValueSource)o).field);
}

View File

@ -0,0 +1,237 @@
package org.apache.lucene.queries.function;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queries.function.valuesource.NormValueSource;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
 * Runs a {@link FunctionQuery} over {@link NormValueSource} against a small
 * two-document index written with {@link PreciseDefaultSimilarity}, a
 * similarity that encodes each norm as the bits of a float rather than as a
 * single byte.
 */
public class TestLongNormValueSource extends LuceneTestCase {

  /** Directory holding the index built in {@link #beforeClass()}. */
  static Directory dir;

  /** Reader obtained from the index writer in {@link #beforeClass()}. */
  static IndexReader reader;

  /** Searcher wrapping {@link #reader}. */
  static IndexSearcher searcher;

  /** Similarity installed on the index writer and, during the test, on the searcher. */
  private static Similarity sim = new PreciseDefaultSimilarity();

  /** Builds the shared index: two documents, each with a single "text" field. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    dir = newDirectory();

    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    config.setMergePolicy(newLogMergePolicy());
    config.setSimilarity(sim);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

    Document first = new Document();
    first.add(new TextField("text", "this is a test test test", Field.Store.NO));
    writer.addDocument(first);

    Document second = new Document();
    second.add(new TextField("text", "second test", Field.Store.NO));
    writer.addDocument(second);

    // Open the reader and searcher before the writer is closed.
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
  }

  /** Closes the shared reader and directory and clears the static references. */
  @AfterClass
  public static void afterClass() throws Exception {
    searcher = null;

    reader.close();
    reader = null;

    dir.close();
    dir = null;
  }

  /**
   * Queries the norm of the "text" field through a {@link FunctionQuery} while
   * {@link #sim} is installed on the searcher; the searcher's previous
   * similarity is restored afterwards.
   */
  public void testNorm() throws Exception {
    final Similarity previous = searcher.getSimilarity();
    try {
      searcher.setSimilarity(sim);
      Query normQuery = new FunctionQuery(new NormValueSource("text"));
      float[] expectedScores = { 0f, 0f };
      assertHits(normQuery, expectedScores);
    } finally {
      searcher.setSimilarity(previous);
    }
  }

  /**
   * Builds the expected hits (document {@code i} paired with {@code scores[i]})
   * and hands them, together with the actual search results for {@code q}, to
   * the {@link CheckHits} helpers.
   *
   * @param q      query to run against {@link #searcher}
   * @param scores expected score for each document, indexed by document id
   */
  void assertHits(Query q, float[] scores) throws Exception {
    final int count = scores.length;
    ScoreDoc[] expectedHits = new ScoreDoc[count];
    int[] expectedIds = new int[count];
    for (int docId = 0; docId < count; docId++) {
      expectedIds[docId] = docId;
      expectedHits[docId] = new ScoreDoc(docId, scores[docId]);
    }

    Sort byId = new Sort(new SortField("id", SortField.Type.STRING));
    TopDocs topDocs = searcher.search(q, 2, byId);

    // Debugging aid: print searcher.explain(q, hit.doc) for each hit in topDocs.scoreDocs.

    CheckHits.checkHits(random(), q, "", searcher, expectedIds);
    CheckHits.checkHitsQuery(q, expectedHits, topDocs.scoreDocs, expectedIds);
    CheckHits.checkExplanations(q, "", searcher);
  }
}
/**
 * A {@link TFIDFSimilarity} that stores each norm as the raw bit pattern of a
 * 4-byte float instead of compressing it into a single byte.
 */
class PreciseDefaultSimilarity extends TFIDFSimilarity {

  /**
   * True if overlap tokens (tokens with a position increment of zero) are
   * discounted from the document's length.
   */
  protected boolean discountOverlaps = true;

  /** Sole constructor: parameter-free */
  public PreciseDefaultSimilarity() {}

  /** Implemented as <code>overlap / maxOverlap</code>. */
  @Override
  public float coord(int overlap, int maxOverlap) {
    return overlap / (float) maxOverlap;
  }

  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
  @Override
  public float queryNorm(float sumOfSquaredWeights) {
    return (float) (1.0 / Math.sqrt(sumOfSquaredWeights));
  }

  /**
   * Encodes a normalization factor for storage in an index.
   * <p>
   * The value is stored as the 32-bit pattern returned by
   * {@link Float#floatToIntBits(float)}, widened to a <code>long</code>, so
   * the encoded norm does not fit in a single byte.
   *
   * @param f the normalization factor to encode
   * @return the int bit pattern of <code>f</code>, widened to long
   * @see #decodeNormValue(long)
   */
  @Override
  public final long encodeNormValue(float f) {
    return Float.floatToIntBits(f);
  }

  /**
   * Decodes a norm value produced by {@link #encodeNormValue(float)}.
   * <p>
   * The low 32 bits of <code>norm</code> are reinterpreted as a float via
   * {@link Float#intBitsToFloat(int)}.
   *
   * @param norm the encoded norm
   * @return the float whose bit pattern is the low 32 bits of <code>norm</code>
   * @see #encodeNormValue(float)
   */
  @Override
  public final float decodeNormValue(long norm) {
    return Float.intBitsToFloat((int) norm);
  }

  /** Implemented as
   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
   *  #setDiscountOverlaps} is false, else it's {@link
   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
   *
   *  @lucene.experimental */
  @Override
  public float lengthNorm(FieldInvertState state) {
    final int numTerms;
    if (discountOverlaps) {
      numTerms = state.getLength() - state.getNumOverlap();
    } else {
      numTerms = state.getLength();
    }
    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
  }

  /** Implemented as <code>sqrt(freq)</code>. */
  @Override
  public float tf(float freq) {
    return (float) Math.sqrt(freq);
  }

  /** Implemented as <code>1 / (distance + 1)</code>. */
  @Override
  public float sloppyFreq(int distance) {
    return 1.0f / (distance + 1);
  }

  /** This implementation always returns <code>1</code>. */
  @Override
  public float scorePayload(int doc, int start, int end, BytesRef payload) {
    return 1;
  }

  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
  @Override
  public float idf(long docFreq, long numDocs) {
    return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
  }

  /** Determines whether overlap tokens (Tokens with
   *  0 position increment) are ignored when computing
   *  norm.  By default this is true, meaning overlap
   *  tokens do not count when computing norms.
   *
   *  @lucene.experimental
   *
   *  @see #computeNorm
   */
  public void setDiscountOverlaps(boolean v) {
    discountOverlaps = v;
  }

  /**
   * Returns true if overlap tokens are discounted from the document's length.
   * @see #setDiscountOverlaps
   */
  public boolean getDiscountOverlaps() {
    return discountOverlaps;
  }

  /** Returns this class's own name, so debug output does not mistake it for another similarity. */
  @Override
  public String toString() {
    return "PreciseDefaultSimilarity";
  }
}