mirror of https://github.com/apache/lucene.git
LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fa2423a2ec
commit
f335512f02
|
@ -217,6 +217,10 @@ Bug fixes
|
|||
return any groups when the joined query required more than one
|
||||
rewrite step (Peng Cheng via Mike McCandless)
|
||||
|
||||
* LUCENE-5398: NormValueSource was incorrectly casting the long value
|
||||
to byte, before calling Similarity.decodeNormValue. (Peng Cheng via
|
||||
Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5339: The facet module was simplified/reworked to make the
|
||||
|
|
|
@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
|
|||
Field newField = new Field(field.name(), (FieldType) field.fieldType());
|
||||
|
||||
newField.fieldsData = field.stringValue();
|
||||
if (newField.fieldsData == null)
|
||||
if (newField.fieldsData == null) {
|
||||
newField.fieldsData = field.numericValue();
|
||||
if (newField.fieldsData == null)
|
||||
}
|
||||
if (newField.fieldsData == null) {
|
||||
newField.fieldsData = field.binaryValue();
|
||||
if (newField.fieldsData == null)
|
||||
}
|
||||
if (newField.fieldsData == null) {
|
||||
newField.fieldsData = field.readerValue();
|
||||
}
|
||||
|
||||
add(newField);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
|
|||
for (int i = 0; i < fields.size(); i++) {
|
||||
IndexableField field = fields.get(i);
|
||||
buffer.append(field.toString());
|
||||
if (i != fields.size()-1)
|
||||
if (i != fields.size()-1) {
|
||||
buffer.append(" ");
|
||||
}
|
||||
}
|
||||
buffer.append(">");
|
||||
return buffer.toString();
|
||||
|
|
|
@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
|
|||
return new FloatDocValues(this) {
|
||||
@Override
|
||||
public float floatVal(int doc) {
|
||||
return similarity.decodeNormValue((byte)norms.get(doc));
|
||||
return similarity.decodeNormValue(norms.get(doc));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this.getClass() != o.getClass()) return false;
|
||||
if (this.getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
return this.field.equals(((NormValueSource)o).field);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,237 @@
|
|||
package org.apache.lucene.queries.function;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.queries.function.valuesource.NormValueSource;
|
||||
import org.apache.lucene.search.CheckHits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestLongNormValueSource extends LuceneTestCase {
|
||||
|
||||
static Directory dir;
|
||||
static IndexReader reader;
|
||||
static IndexSearcher searcher;
|
||||
private static Similarity sim = new PreciseDefaultSimilarity();
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
dir = newDirectory();
|
||||
IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
iwConfig.setMergePolicy(newLogMergePolicy());
|
||||
iwConfig.setSimilarity(sim);
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new TextField("text", "second test", Field.Store.NO));
|
||||
iw.addDocument(doc);
|
||||
|
||||
reader = iw.getReader();
|
||||
searcher = newSearcher(reader);
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() throws Exception {
|
||||
searcher = null;
|
||||
reader.close();
|
||||
reader = null;
|
||||
dir.close();
|
||||
dir = null;
|
||||
}
|
||||
|
||||
public void testNorm() throws Exception {
|
||||
Similarity saved = searcher.getSimilarity();
|
||||
try {
|
||||
// no norm field (so agnostic to indexed similarity)
|
||||
searcher.setSimilarity(sim);
|
||||
assertHits(new FunctionQuery(
|
||||
new NormValueSource("text")),
|
||||
new float[] { 0f, 0f });
|
||||
} finally {
|
||||
searcher.setSimilarity(saved);
|
||||
}
|
||||
}
|
||||
|
||||
void assertHits(Query q, float scores[]) throws Exception {
|
||||
ScoreDoc expected[] = new ScoreDoc[scores.length];
|
||||
int expectedDocs[] = new int[scores.length];
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
expectedDocs[i] = i;
|
||||
expected[i] = new ScoreDoc(i, scores[i]);
|
||||
}
|
||||
TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
|
||||
|
||||
/*
|
||||
for (int i=0;i<docs.scoreDocs.length;i++) {
|
||||
System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
|
||||
}
|
||||
*/
|
||||
|
||||
CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
|
||||
CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
|
||||
CheckHits.checkExplanations(q, "", searcher);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Encodes norm as 4-byte float. */
|
||||
class PreciseDefaultSimilarity extends TFIDFSimilarity {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public PreciseDefaultSimilarity() {}
|
||||
|
||||
/** Implemented as <code>overlap / maxOverlap</code>. */
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return overlap / (float)maxOverlap;
|
||||
}
|
||||
|
||||
/** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a normalization factor for storage in an index.
|
||||
* <p>
|
||||
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
|
||||
* zero-exponent point at 15, thus representing values from around 7x10^9 to
|
||||
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
|
||||
* represented. Negative numbers are rounded up to zero. Values too large to
|
||||
* represent are rounded down to the largest representable value. Positive
|
||||
* values too small to represent are rounded up to the smallest positive
|
||||
* representable value.
|
||||
*
|
||||
* @see org.apache.lucene.document.Field#setBoost(float)
|
||||
* @see org.apache.lucene.util.SmallFloat
|
||||
*/
|
||||
@Override
|
||||
public final long encodeNormValue(float f) {
|
||||
return Float.floatToIntBits(f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the norm value, assuming it is a single byte.
|
||||
*
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
@Override
|
||||
public final float decodeNormValue(long norm) {
|
||||
return Float.intBitsToFloat((int)norm);
|
||||
}
|
||||
|
||||
/** Implemented as
|
||||
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
|
||||
* <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
|
||||
* #setDiscountOverlaps} is false, else it's {@link
|
||||
* org.apache.lucene.index.FieldInvertState#getLength()} - {@link
|
||||
* org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
|
||||
}
|
||||
|
||||
/** Implemented as <code>sqrt(freq)</code>. */
|
||||
@Override
|
||||
public float tf(float freq) {
|
||||
return (float)Math.sqrt(freq);
|
||||
}
|
||||
|
||||
/** Implemented as <code>1 / (distance + 1)</code>. */
|
||||
@Override
|
||||
public float sloppyFreq(int distance) {
|
||||
return 1.0f / (distance + 1);
|
||||
}
|
||||
|
||||
/** The default implementation returns <code>1</code> */
|
||||
@Override
|
||||
public float scorePayload(int doc, int start, int end, BytesRef payload) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
||||
}
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position of increment of zero) are
|
||||
* discounted from the document's length.
|
||||
*/
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Determines whether overlap tokens (Tokens with
|
||||
* 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap
|
||||
* tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
* @see #setDiscountOverlaps
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DefaultSimilarity";
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue