LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-01-31 12:16:31 +00:00
parent fa2423a2ec
commit f335512f02
4 changed files with 254 additions and 7 deletions

View File

@ -217,6 +217,10 @@ Bug fixes
return any groups when the joined query required more than one return any groups when the joined query required more than one
rewrite step (Peng Cheng via Mike McCandless) rewrite step (Peng Cheng via Mike McCandless)
* LUCENE-5398: NormValueSource was incorrectly casting the long value
to byte before calling Similarity.decodeNormValue. (Peng Cheng via
Mike McCandless)
API Changes API Changes
* LUCENE-5339: The facet module was simplified/reworked to make the * LUCENE-5339: The facet module was simplified/reworked to make the

View File

@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
Field newField = new Field(field.name(), (FieldType) field.fieldType()); Field newField = new Field(field.name(), (FieldType) field.fieldType());
newField.fieldsData = field.stringValue(); newField.fieldsData = field.stringValue();
if (newField.fieldsData == null) if (newField.fieldsData == null) {
newField.fieldsData = field.numericValue(); newField.fieldsData = field.numericValue();
if (newField.fieldsData == null) }
if (newField.fieldsData == null) {
newField.fieldsData = field.binaryValue(); newField.fieldsData = field.binaryValue();
if (newField.fieldsData == null) }
if (newField.fieldsData == null) {
newField.fieldsData = field.readerValue(); newField.fieldsData = field.readerValue();
}
add(newField); add(newField);
} }
} }
/** /**
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
for (int i = 0; i < fields.size(); i++) { for (int i = 0; i < fields.size(); i++) {
IndexableField field = fields.get(i); IndexableField field = fields.get(i);
buffer.append(field.toString()); buffer.append(field.toString());
if (i != fields.size()-1) if (i != fields.size()-1) {
buffer.append(" "); buffer.append(" ");
}
} }
buffer.append(">"); buffer.append(">");
return buffer.toString(); return buffer.toString();

View File

@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
return new FloatDocValues(this) { return new FloatDocValues(this) {
@Override @Override
public float floatVal(int doc) { public float floatVal(int doc) {
return similarity.decodeNormValue((byte)norms.get(doc)); return similarity.decodeNormValue(norms.get(doc));
} }
}; };
} }
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (this.getClass() != o.getClass()) return false; if (this.getClass() != o.getClass()) {
return false;
}
return this.field.equals(((NormValueSource)o).field); return this.field.equals(((NormValueSource)o).field);
} }

View File

@ -0,0 +1,237 @@
package org.apache.lucene.queries.function;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queries.function.valuesource.NormValueSource;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
 * Exercises {@link NormValueSource} against an index written with
 * {@link PreciseDefaultSimilarity}, a similarity whose encoded norms are the
 * 32 raw bits of a float and therefore do not fit in a single byte.
 */
public class TestLongNormValueSource extends LuceneTestCase {

  static Directory dir;
  static IndexReader reader;
  static IndexSearcher searcher;

  /** Similarity used both at index time and at search time. */
  private static final Similarity sim = new PreciseDefaultSimilarity();

  /**
   * Indexes two small documents into the "text" field using the precise
   * similarity and opens the shared reader and searcher over them.
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    dir = newDirectory();
    IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwConfig.setMergePolicy(newLogMergePolicy());
    iwConfig.setSimilarity(sim);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);

    Document doc = new Document();
    doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
    iw.addDocument(doc);

    doc = new Document();
    doc.add(new TextField("text", "second test", Field.Store.NO));
    iw.addDocument(doc);

    reader = iw.getReader();
    searcher = newSearcher(reader);
    iw.close();
  }

  /**
   * Releases the shared reader and directory and clears the static fields.
   * The directory is closed even when closing the reader throws, and nothing
   * is dereferenced if setup failed before the fields were assigned.
   */
  @AfterClass
  public static void afterClass() throws Exception {
    searcher = null;
    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      reader = null;
      try {
        if (dir != null) {
          dir.close();
        }
      } finally {
        dir = null;
      }
    }
  }

  /**
   * Runs a {@link FunctionQuery} over the norm of the "text" field with the
   * precise similarity installed on the searcher, restoring the searcher's
   * previous similarity afterwards.
   */
  public void testNorm() throws Exception {
    Similarity saved = searcher.getSimilarity();
    try {
      // no norm field (so agnostic to indexed similarity)
      // NOTE(review): this comment and the expected 0f scores look inherited
      // from another test; confirm they describe the index built above.
      searcher.setSimilarity(sim);
      assertHits(new FunctionQuery(
          new NormValueSource("text")),
          new float[] { 0f, 0f });
    } finally {
      searcher.setSimilarity(saved);
    }
  }

  /**
   * Searches with {@code q} and verifies, through the {@link CheckHits}
   * helpers, that hit {@code i} is document {@code i} for every index of
   * {@code scores}, and that the query's explanations pass
   * {@link CheckHits#checkExplanations}.
   *
   * @param q the query to run against the shared searcher
   * @param scores expected score for document {@code i} at position {@code i}
   */
  void assertHits(Query q, float[] scores) throws Exception {
    ScoreDoc[] expected = new ScoreDoc[scores.length];
    int[] expectedDocs = new int[scores.length];
    for (int i = 0; i < expected.length; i++) {
      expectedDocs[i] = i;
      expected[i] = new ScoreDoc(i, scores[i]);
    }
    // Ask for exactly as many hits as there are expectations.
    TopDocs docs = searcher.search(q, scores.length, new Sort(new SortField("id", SortField.Type.STRING)));
    // Debugging aid: print searcher.explain(q, hit.doc) for each hit in docs.scoreDocs.
    CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
    CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
    CheckHits.checkExplanations(q, "", searcher);
  }
}
/**
 * A {@link TFIDFSimilarity} that stores each norm as the full 32-bit pattern
 * of a float rather than compressing it, so encoded norms are wider than one
 * byte.
 */
class PreciseDefaultSimilarity extends TFIDFSimilarity {

  /**
   * True if overlap tokens (tokens with a position of increment of zero) are
   * discounted from the document's length.
   */
  protected boolean discountOverlaps = true;

  /** Sole constructor: parameter-free */
  public PreciseDefaultSimilarity() {}

  /** Implemented as <code>overlap / maxOverlap</code>. */
  @Override
  public float coord(int overlap, int maxOverlap) {
    return overlap / (float) maxOverlap;
  }

  /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
  @Override
  public float queryNorm(float sumOfSquaredWeights) {
    return (float) (1.0 / Math.sqrt(sumOfSquaredWeights));
  }

  /**
   * Encodes a normalization factor for storage in an index.
   * <p>
   * The value is stored as the 32-bit IEEE 754 bit pattern produced by
   * {@link Float#floatToIntBits(float)}, widened to a <code>long</code>.
   * No lossy compression is applied.
   *
   * @see #decodeNormValue(long)
   * @see org.apache.lucene.document.Field#setBoost(float)
   */
  @Override
  public final long encodeNormValue(float f) {
    return Float.floatToIntBits(f);
  }

  /**
   * Decodes a norm written by {@link #encodeNormValue(float)}: the low 32
   * bits of <code>norm</code> are reinterpreted as a float via
   * {@link Float#intBitsToFloat(int)}.
   *
   * @see #encodeNormValue(float)
   */
  @Override
  public final float decodeNormValue(long norm) {
    return Float.intBitsToFloat((int) norm);
  }

  /** Implemented as
   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
   *  <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
   *  #setDiscountOverlaps} is false, else it's {@link
   *  org.apache.lucene.index.FieldInvertState#getLength()} - {@link
   *  org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
   *
   *  @lucene.experimental */
  @Override
  public float lengthNorm(FieldInvertState state) {
    final int numTerms;
    if (discountOverlaps) {
      numTerms = state.getLength() - state.getNumOverlap();
    } else {
      numTerms = state.getLength();
    }
    return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
  }

  /** Implemented as <code>sqrt(freq)</code>. */
  @Override
  public float tf(float freq) {
    return (float) Math.sqrt(freq);
  }

  /** Implemented as <code>1 / (distance + 1)</code>. */
  @Override
  public float sloppyFreq(int distance) {
    return 1.0f / (distance + 1);
  }

  /** The default implementation returns <code>1</code> */
  @Override
  public float scorePayload(int doc, int start, int end, BytesRef payload) {
    return 1;
  }

  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
  @Override
  public float idf(long docFreq, long numDocs) {
    return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
  }

  /** Determines whether overlap tokens (Tokens with
   *  0 position increment) are ignored when computing
   *  norm.  By default this is true, meaning overlap
   *  tokens do not count when computing norms.
   *
   *  @lucene.experimental
   *
   *  @see #computeNorm
   */
  public void setDiscountOverlaps(boolean v) {
    discountOverlaps = v;
  }

  /**
   * Returns true if overlap tokens are discounted from the document's length.
   * @see #setDiscountOverlaps
   */
  public boolean getDiscountOverlaps() {
    return discountOverlaps;
  }

  /** Returns this class's own name so debug output is not mistaken for DefaultSimilarity. */
  @Override
  public String toString() {
    return "PreciseDefaultSimilarity";
  }
}