mirror of https://github.com/apache/lucene.git
LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
40d290ee84
commit
9dc0ba4c7b
|
@ -184,6 +184,10 @@ Bug Fixes
|
|||
EOFException if you seek past the end of the file and then try to
|
||||
read (Stéphane Campinas via Mike McCandless)
|
||||
|
||||
* LUCENE-6896: Don't treat the smallest possible norm value as an infinitely
|
||||
long document in SimilarityBase or BM25Similarity. Add more warnings to sims
|
||||
that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)
|
||||
|
|
|
@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity {
|
|||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
NORM_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
|||
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
|
||||
* and {@code N} is increased by {@code F}
|
||||
* @lucene.experimental
|
||||
* NOTE: in some corner cases this model may give poor performance with Normalizations that
|
||||
* return large values for {@code tfn} such as NormalizationH3. Consider using the
|
||||
* geometric approximation ({@link BasicModelG}) instead, which provides the same relevance
|
||||
* but with less practical problems.
|
||||
* NOTE: in some corner cases this model may give poor performance or infinite scores with
|
||||
* Normalizations that return large or small values for {@code tfn} such as NormalizationH3.
|
||||
* Consider using the geometric approximation ({@link BasicModelG}) instead, which provides
|
||||
* the same relevance but with less practical problems.
|
||||
*/
|
||||
public class BasicModelBE extends BasicModel {
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
|||
* <p>
|
||||
* WARNING: for terms that do not meet the expected random distribution
|
||||
* (e.g. stopwords), this model may give poor performance, such as
|
||||
* abnormally high scores for low tf values.
|
||||
* abnormally high or NaN scores for low tf values.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelD extends BasicModel {
|
||||
|
|
|
@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities;
|
|||
* <p>Unlike for DFR, the natural logarithm is used, as
|
||||
* it is faster to compute and the original paper does not express any
|
||||
* preference to a specific base.</p>
|
||||
* WARNING: this model currently returns infinite scores for very small
|
||||
* tf values and negative scores for very large tf values
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DistributionSPL extends Distribution {
|
||||
|
|
|
@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity {
|
|||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
|
||||
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
|
||||
}
|
||||
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||
}
|
||||
|
||||
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestBM25Similarity extends LuceneTestCase {
|
||||
|
||||
public void testSaneNormValues() {
|
||||
BM25Similarity sim = new BM25Similarity();
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float len = sim.decodeNormValue((byte) i);
|
||||
assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
|
||||
assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
|
||||
assertFalse("nan len for byte=" + i, Float.isNaN(len));
|
||||
if (i > 0) {
|
||||
assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase {
|
|||
assertEquals(1, topDocs.scoreDocs.length);
|
||||
assertTrue(topDocs.scoreDocs[0].score != 0);
|
||||
}
|
||||
|
||||
public void testSaneNormValues() {
|
||||
ClassicSimilarity sim = new ClassicSimilarity();
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float boost = sim.decodeNormValue((byte) i);
|
||||
assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
|
||||
assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
|
||||
assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
|
||||
if (i > 0) {
|
||||
assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase {
|
|||
TopDocs td = is.search(query, 10);
|
||||
assertEquals(1, td.totalHits);
|
||||
float score = td.scoreDocs[0].score;
|
||||
assertTrue(score >= 0.0f);
|
||||
assertFalse("negative score for " + sim, score < 0.0f);
|
||||
assertFalse("inf score for " + sim, Float.isInfinite(score));
|
||||
assertFalse("nan score for " + sim, Float.isNaN(score));
|
||||
}
|
||||
ir.close();
|
||||
dir.close();
|
||||
|
|
|
@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
actual.setDiscountOverlaps(true);
|
||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||
}
|
||||
|
||||
public void testSaneNormValues() {
|
||||
for (SimilarityBase sim : sims) {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float len = sim.decodeNormValue((byte) i);
|
||||
assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f);
|
||||
assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len));
|
||||
assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len));
|
||||
if (i > 0) {
|
||||
assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure the similarity does not go crazy when tested against all possible norm values.
|
||||
*/
|
||||
public void testCrazyIndexTimeBoosts() throws Exception {
|
||||
long avgLength = 750;
|
||||
long docCount = 500000;
|
||||
long numTokens = docCount * avgLength;
|
||||
|
||||
CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);
|
||||
|
||||
long docFreq = 2000;
|
||||
long totalTermFreq = 2000 * avgLength;
|
||||
|
||||
TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);
|
||||
|
||||
for (SimilarityBase sim : sims) {
|
||||
if (sim instanceof IBSimilarity) {
|
||||
if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) {
|
||||
// score goes infinite for tiny doc lengths and negative for huge doc lengths
|
||||
// TODO: fix this
|
||||
continue;
|
||||
}
|
||||
} else if (sim instanceof DFRSimilarity) {
|
||||
BasicModel model = ((DFRSimilarity)sim).getBasicModel();
|
||||
if (model instanceof BasicModelD || model instanceof BasicModelP) {
|
||||
// score goes NaN for tiny doc lengths
|
||||
// TODO: fix this
|
||||
continue;
|
||||
} else if (model instanceof BasicModelBE) {
|
||||
// score goes negative infinity for tiny doc lengths
|
||||
// TODO: fix this
|
||||
continue;
|
||||
}
|
||||
}
|
||||
BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats);
|
||||
for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float len = sim.decodeNormValue((byte) i);
|
||||
float score = sim.score(stats, tf, len);
|
||||
assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
|
||||
assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
|
||||
assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue