mirror of https://github.com/apache/lucene.git
LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
40d290ee84
commit
9dc0ba4c7b
|
@ -184,6 +184,10 @@ Bug Fixes
|
||||||
EOFException if you seek past the end of the file and then try to
|
EOFException if you seek past the end of the file and then try to
|
||||||
read (Stéphane Campinas via Mike McCandless)
|
read (Stéphane Campinas via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-6896: Don't treat the smallest possible norm value as an infinitely
|
||||||
|
long document in SimilarityBase or BM25Similarity. Add more warnings to sims
|
||||||
|
that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)
|
* LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)
|
||||||
|
|
|
@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity {
|
||||||
private static final float[] NORM_TABLE = new float[256];
|
private static final float[] NORM_TABLE = new float[256];
|
||||||
|
|
||||||
static {
|
static {
|
||||||
for (int i = 0; i < 256; i++) {
|
for (int i = 1; i < 256; i++) {
|
||||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||||
NORM_TABLE[i] = 1.0f / (f*f);
|
NORM_TABLE[i] = 1.0f / (f*f);
|
||||||
}
|
}
|
||||||
|
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||||
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
|
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
|
||||||
* and {@code N} is increased by {@code F}
|
* and {@code N} is increased by {@code F}
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
* NOTE: in some corner cases this model may give poor performance with Normalizations that
|
* NOTE: in some corner cases this model may give poor performance or infinite scores with
|
||||||
* return large values for {@code tfn} such as NormalizationH3. Consider using the
|
* Normalizations that return large or small values for {@code tfn} such as NormalizationH3.
|
||||||
* geometric approximation ({@link BasicModelG}) instead, which provides the same relevance
|
* Consider using the geometric approximation ({@link BasicModelG}) instead, which provides
|
||||||
* but with less practical problems.
|
* the same relevance but with fewer practical problems.
|
||||||
*/
|
*/
|
||||||
public class BasicModelBE extends BasicModel {
|
public class BasicModelBE extends BasicModel {
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||||
* <p>
|
* <p>
|
||||||
* WARNING: for terms that do not meet the expected random distribution
|
* WARNING: for terms that do not meet the expected random distribution
|
||||||
* (e.g. stopwords), this model may give poor performance, such as
|
* (e.g. stopwords), this model may give poor performance, such as
|
||||||
* abnormally high scores for low tf values.
|
* abnormally high or NaN scores for low tf values.
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class BasicModelD extends BasicModel {
|
public class BasicModelD extends BasicModel {
|
||||||
|
|
|
@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities;
|
||||||
* <p>Unlike for DFR, the natural logarithm is used, as
|
* <p>Unlike for DFR, the natural logarithm is used, as
|
||||||
* it is faster to compute and the original paper does not express any
|
* it is faster to compute and the original paper does not express any
|
||||||
* preference to a specific base.</p>
|
* preference to a specific base.</p>
|
||||||
|
* WARNING: this model currently returns infinite scores for very small
|
||||||
|
* tf values and negative scores for very large tf values
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class DistributionSPL extends Distribution {
|
public class DistributionSPL extends Distribution {
|
||||||
|
|
|
@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity {
|
||||||
private static final float[] NORM_TABLE = new float[256];
|
private static final float[] NORM_TABLE = new float[256];
|
||||||
|
|
||||||
static {
|
static {
|
||||||
for (int i = 0; i < 256; i++) {
|
for (int i = 1; i < 256; i++) {
|
||||||
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
|
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
|
||||||
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
|
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
|
||||||
}
|
}
|
||||||
|
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
|
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.lucene.search.similarities;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestBM25Similarity extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testSaneNormValues() {
|
||||||
|
BM25Similarity sim = new BM25Similarity();
|
||||||
|
for (int i = 0; i < 256; i++) {
|
||||||
|
float len = sim.decodeNormValue((byte) i);
|
||||||
|
assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
|
||||||
|
assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
|
||||||
|
assertFalse("nan len for byte=" + i, Float.isNaN(len));
|
||||||
|
if (i > 0) {
|
||||||
|
assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase {
|
||||||
assertEquals(1, topDocs.scoreDocs.length);
|
assertEquals(1, topDocs.scoreDocs.length);
|
||||||
assertTrue(topDocs.scoreDocs[0].score != 0);
|
assertTrue(topDocs.scoreDocs[0].score != 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSaneNormValues() {
|
||||||
|
ClassicSimilarity sim = new ClassicSimilarity();
|
||||||
|
for (int i = 0; i < 256; i++) {
|
||||||
|
float boost = sim.decodeNormValue((byte) i);
|
||||||
|
assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
|
||||||
|
assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
|
||||||
|
assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
|
||||||
|
if (i > 0) {
|
||||||
|
assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase {
|
||||||
TopDocs td = is.search(query, 10);
|
TopDocs td = is.search(query, 10);
|
||||||
assertEquals(1, td.totalHits);
|
assertEquals(1, td.totalHits);
|
||||||
float score = td.scoreDocs[0].score;
|
float score = td.scoreDocs[0].score;
|
||||||
assertTrue(score >= 0.0f);
|
assertFalse("negative score for " + sim, score < 0.0f);
|
||||||
assertFalse("inf score for " + sim, Float.isInfinite(score));
|
assertFalse("inf score for " + sim, Float.isInfinite(score));
|
||||||
|
assertFalse("nan score for " + sim, Float.isNaN(score));
|
||||||
}
|
}
|
||||||
ir.close();
|
ir.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
|
|
@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase {
|
||||||
actual.setDiscountOverlaps(true);
|
actual.setDiscountOverlaps(true);
|
||||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSaneNormValues() {
|
||||||
|
for (SimilarityBase sim : sims) {
|
||||||
|
for (int i = 0; i < 256; i++) {
|
||||||
|
float len = sim.decodeNormValue((byte) i);
|
||||||
|
assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f);
|
||||||
|
assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len));
|
||||||
|
assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len));
|
||||||
|
if (i > 0) {
|
||||||
|
assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* make sure the similarity does not go crazy when tested against all possible norm values.
|
||||||
|
*/
|
||||||
|
public void testCrazyIndexTimeBoosts() throws Exception {
|
||||||
|
long avgLength = 750;
|
||||||
|
long docCount = 500000;
|
||||||
|
long numTokens = docCount * avgLength;
|
||||||
|
|
||||||
|
CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);
|
||||||
|
|
||||||
|
long docFreq = 2000;
|
||||||
|
long totalTermFreq = 2000 * avgLength;
|
||||||
|
|
||||||
|
TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);
|
||||||
|
|
||||||
|
for (SimilarityBase sim : sims) {
|
||||||
|
if (sim instanceof IBSimilarity) {
|
||||||
|
if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) {
|
||||||
|
// score goes infinite for tiny doc lengths and negative for huge doc lengths
|
||||||
|
// TODO: fix this
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if (sim instanceof DFRSimilarity) {
|
||||||
|
BasicModel model = ((DFRSimilarity)sim).getBasicModel();
|
||||||
|
if (model instanceof BasicModelD || model instanceof BasicModelP) {
|
||||||
|
// score goes NaN for tiny doc lengths
|
||||||
|
// TODO: fix this
|
||||||
|
continue;
|
||||||
|
} else if (model instanceof BasicModelBE) {
|
||||||
|
// score goes negative infinity for tiny doc lengths
|
||||||
|
// TODO: fix this
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats);
|
||||||
|
for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) {
|
||||||
|
for (int i = 0; i < 256; i++) {
|
||||||
|
float len = sim.decodeNormValue((byte) i);
|
||||||
|
float score = sim.score(stats, tf, len);
|
||||||
|
assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
|
||||||
|
assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
|
||||||
|
assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue