mirror of https://github.com/apache/lucene.git
LUCENE-5398: remove invalid byte cast in NormValueSource, since TFIDFSimilarity now allows for norms larger than one byte
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563119 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fa2423a2ec
commit
f335512f02
|
@ -217,6 +217,10 @@ Bug fixes
|
||||||
return any groups when the joined query required more than one
|
return any groups when the joined query required more than one
|
||||||
rewrite step (Peng Cheng via Mike McCandless)
|
rewrite step (Peng Cheng via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-5398: NormValueSource was incorrectly casting the long value
|
||||||
|
to byte, before calling Similarity.decodeNormValue. (Peng Cheng via
|
||||||
|
Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5339: The facet module was simplified/reworked to make the
|
* LUCENE-5339: The facet module was simplified/reworked to make the
|
||||||
|
|
|
@ -60,16 +60,19 @@ public final class Document implements IndexDocument {
|
||||||
Field newField = new Field(field.name(), (FieldType) field.fieldType());
|
Field newField = new Field(field.name(), (FieldType) field.fieldType());
|
||||||
|
|
||||||
newField.fieldsData = field.stringValue();
|
newField.fieldsData = field.stringValue();
|
||||||
if (newField.fieldsData == null)
|
if (newField.fieldsData == null) {
|
||||||
newField.fieldsData = field.numericValue();
|
newField.fieldsData = field.numericValue();
|
||||||
if (newField.fieldsData == null)
|
}
|
||||||
|
if (newField.fieldsData == null) {
|
||||||
newField.fieldsData = field.binaryValue();
|
newField.fieldsData = field.binaryValue();
|
||||||
if (newField.fieldsData == null)
|
}
|
||||||
|
if (newField.fieldsData == null) {
|
||||||
newField.fieldsData = field.readerValue();
|
newField.fieldsData = field.readerValue();
|
||||||
|
}
|
||||||
|
|
||||||
add(newField);
|
add(newField);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -273,8 +276,9 @@ public final class Document implements IndexDocument {
|
||||||
for (int i = 0; i < fields.size(); i++) {
|
for (int i = 0; i < fields.size(); i++) {
|
||||||
IndexableField field = fields.get(i);
|
IndexableField field = fields.get(i);
|
||||||
buffer.append(field.toString());
|
buffer.append(field.toString());
|
||||||
if (i != fields.size()-1)
|
if (i != fields.size()-1) {
|
||||||
buffer.append(" ");
|
buffer.append(" ");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
buffer.append(">");
|
buffer.append(">");
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
|
|
|
@ -71,14 +71,16 @@ public class NormValueSource extends ValueSource {
|
||||||
return new FloatDocValues(this) {
|
return new FloatDocValues(this) {
|
||||||
@Override
|
@Override
|
||||||
public float floatVal(int doc) {
|
public float floatVal(int doc) {
|
||||||
return similarity.decodeNormValue((byte)norms.get(doc));
|
return similarity.decodeNormValue(norms.get(doc));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this.getClass() != o.getClass()) return false;
|
if (this.getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return this.field.equals(((NormValueSource)o).field);
|
return this.field.equals(((NormValueSource)o).field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,237 @@
|
||||||
|
package org.apache.lucene.queries.function;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.queries.function.valuesource.NormValueSource;
|
||||||
|
import org.apache.lucene.search.CheckHits;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
|
import org.apache.lucene.search.Sort;
|
||||||
|
import org.apache.lucene.search.SortField;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.junit.AfterClass;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
public class TestLongNormValueSource extends LuceneTestCase {
|
||||||
|
|
||||||
|
static Directory dir;
|
||||||
|
static IndexReader reader;
|
||||||
|
static IndexSearcher searcher;
|
||||||
|
private static Similarity sim = new PreciseDefaultSimilarity();
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() throws Exception {
|
||||||
|
dir = newDirectory();
|
||||||
|
IndexWriterConfig iwConfig = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||||
|
iwConfig.setMergePolicy(newLogMergePolicy());
|
||||||
|
iwConfig.setSimilarity(sim);
|
||||||
|
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConfig);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new TextField("text", "this is a test test test", Field.Store.NO));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new TextField("text", "second test", Field.Store.NO));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
reader = iw.getReader();
|
||||||
|
searcher = newSearcher(reader);
|
||||||
|
iw.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public static void afterClass() throws Exception {
|
||||||
|
searcher = null;
|
||||||
|
reader.close();
|
||||||
|
reader = null;
|
||||||
|
dir.close();
|
||||||
|
dir = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNorm() throws Exception {
|
||||||
|
Similarity saved = searcher.getSimilarity();
|
||||||
|
try {
|
||||||
|
// no norm field (so agnostic to indexed similarity)
|
||||||
|
searcher.setSimilarity(sim);
|
||||||
|
assertHits(new FunctionQuery(
|
||||||
|
new NormValueSource("text")),
|
||||||
|
new float[] { 0f, 0f });
|
||||||
|
} finally {
|
||||||
|
searcher.setSimilarity(saved);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void assertHits(Query q, float scores[]) throws Exception {
|
||||||
|
ScoreDoc expected[] = new ScoreDoc[scores.length];
|
||||||
|
int expectedDocs[] = new int[scores.length];
|
||||||
|
for (int i = 0; i < expected.length; i++) {
|
||||||
|
expectedDocs[i] = i;
|
||||||
|
expected[i] = new ScoreDoc(i, scores[i]);
|
||||||
|
}
|
||||||
|
TopDocs docs = searcher.search(q, 2, new Sort(new SortField("id", SortField.Type.STRING)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
for (int i=0;i<docs.scoreDocs.length;i++) {
|
||||||
|
System.out.println(searcher.explain(q, docs.scoreDocs[i].doc));
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
CheckHits.checkHits(random(), q, "", searcher, expectedDocs);
|
||||||
|
CheckHits.checkHitsQuery(q, expected, docs.scoreDocs, expectedDocs);
|
||||||
|
CheckHits.checkExplanations(q, "", searcher);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Encodes norm as 4-byte float. */
|
||||||
|
class PreciseDefaultSimilarity extends TFIDFSimilarity {
|
||||||
|
|
||||||
|
/** Sole constructor: parameter-free */
|
||||||
|
public PreciseDefaultSimilarity() {}
|
||||||
|
|
||||||
|
/** Implemented as <code>overlap / maxOverlap</code>. */
|
||||||
|
@Override
|
||||||
|
public float coord(int overlap, int maxOverlap) {
|
||||||
|
return overlap / (float)maxOverlap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
|
||||||
|
@Override
|
||||||
|
public float queryNorm(float sumOfSquaredWeights) {
|
||||||
|
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes a normalization factor for storage in an index.
|
||||||
|
* <p>
|
||||||
|
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
|
||||||
|
* zero-exponent point at 15, thus representing values from around 7x10^9 to
|
||||||
|
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
|
||||||
|
* represented. Negative numbers are rounded up to zero. Values too large to
|
||||||
|
* represent are rounded down to the largest representable value. Positive
|
||||||
|
* values too small to represent are rounded up to the smallest positive
|
||||||
|
* representable value.
|
||||||
|
*
|
||||||
|
* @see org.apache.lucene.document.Field#setBoost(float)
|
||||||
|
* @see org.apache.lucene.util.SmallFloat
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public final long encodeNormValue(float f) {
|
||||||
|
return Float.floatToIntBits(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decodes the norm value, assuming it is a single byte.
|
||||||
|
*
|
||||||
|
* @see #encodeNormValue(float)
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public final float decodeNormValue(long norm) {
|
||||||
|
return Float.intBitsToFloat((int)norm);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as
|
||||||
|
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
|
||||||
|
* <code>numTerms</code> is {@link org.apache.lucene.index.FieldInvertState#getLength()} if {@link
|
||||||
|
* #setDiscountOverlaps} is false, else it's {@link
|
||||||
|
* org.apache.lucene.index.FieldInvertState#getLength()} - {@link
|
||||||
|
* org.apache.lucene.index.FieldInvertState#getNumOverlap()}.
|
||||||
|
*
|
||||||
|
* @lucene.experimental */
|
||||||
|
@Override
|
||||||
|
public float lengthNorm(FieldInvertState state) {
|
||||||
|
final int numTerms;
|
||||||
|
if (discountOverlaps) {
|
||||||
|
numTerms = state.getLength() - state.getNumOverlap();
|
||||||
|
} else {
|
||||||
|
numTerms = state.getLength();
|
||||||
|
}
|
||||||
|
return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>sqrt(freq)</code>. */
|
||||||
|
@Override
|
||||||
|
public float tf(float freq) {
|
||||||
|
return (float)Math.sqrt(freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>1 / (distance + 1)</code>. */
|
||||||
|
@Override
|
||||||
|
public float sloppyFreq(int distance) {
|
||||||
|
return 1.0f / (distance + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The default implementation returns <code>1</code> */
|
||||||
|
@Override
|
||||||
|
public float scorePayload(int doc, int start, int end, BytesRef payload) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
|
||||||
|
@Override
|
||||||
|
public float idf(long docFreq, long numDocs) {
|
||||||
|
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* True if overlap tokens (tokens with a position of increment of zero) are
|
||||||
|
* discounted from the document's length.
|
||||||
|
*/
|
||||||
|
protected boolean discountOverlaps = true;
|
||||||
|
|
||||||
|
/** Determines whether overlap tokens (Tokens with
|
||||||
|
* 0 position increment) are ignored when computing
|
||||||
|
* norm. By default this is true, meaning overlap
|
||||||
|
* tokens do not count when computing norms.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*
|
||||||
|
* @see #computeNorm
|
||||||
|
*/
|
||||||
|
public void setDiscountOverlaps(boolean v) {
|
||||||
|
discountOverlaps = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if overlap tokens are discounted from the document's length.
|
||||||
|
* @see #setDiscountOverlaps
|
||||||
|
*/
|
||||||
|
public boolean getDiscountOverlaps() {
|
||||||
|
return discountOverlaps;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "DefaultSimilarity";
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue