similarities: provide default computeNorm implementation; remove remaining discountOverlaps setters; (#13757)

Co-authored-by: Robert Muir <rmuir@apache.org>
This commit is contained in:
Christine Poerschke 2024-09-13 09:27:35 +01:00 committed by GitHub
parent f778cc4924
commit 7c056ab88c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 129 additions and 151 deletions

View File

@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -33,7 +31,6 @@ import org.apache.lucene.util.SmallFloat;
public class BM25Similarity extends Similarity {
private final float k1;
private final float b;
private final boolean discountOverlaps;
/**
* BM25 with the supplied parameter values.
@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity {
* within the range {@code [0..1]}
*/
public BM25Similarity(float k1, float b, boolean discountOverlaps) {
super(discountOverlaps);
if (Float.isFinite(k1) == false || k1 < 0) {
throw new IllegalArgumentException(
"illegal k1 value: " + k1 + ", must be a non-negative finite value");
@ -55,7 +53,6 @@ public class BM25Similarity extends Similarity {
}
this.k1 = k1;
this.b = b;
this.discountOverlaps = discountOverlaps;
}
/**
@ -110,15 +107,6 @@ public class BM25Similarity extends Similarity {
return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
}
/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #BM25Similarity(float, float, boolean)
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
/** Cache of decoded bytes. */
private static final float[] LENGTH_TABLE = new float[256];
@ -128,19 +116,6 @@ public class BM25Similarity extends Similarity {
}
}
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}
/**
* Computes a score factor for a simple term and returns an explanation for that score factor.
*

View File

@ -16,7 +16,6 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -25,22 +24,15 @@ import org.apache.lucene.search.TermStatistics;
* Simple similarity that gives terms a score that is equal to their query boost. This similarity is
* typically used with disabled norms since neither document statistics nor index statistics are
* used for scoring. That said, if norms are enabled, they will be computed the same way as {@link
* SimilarityBase} and {@link BM25Similarity} with {@link
* SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity}
* can be changed after the index has been created.
* SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps()
* discounted overlaps} so that the {@link Similarity} can be changed after the index has been
* created.
*/
public class BooleanSimilarity extends Similarity {
private static final Similarity BM25_SIM = new BM25Similarity();
/** Sole constructor */
public BooleanSimilarity() {}
@Override
public long computeNorm(FieldInvertState state) {
return BM25_SIM.computeNorm(state);
}
@Override
public SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {

View File

@ -26,8 +26,15 @@ import org.apache.lucene.search.TermStatistics;
*/
public class ClassicSimilarity extends TFIDFSimilarity {
/** Sole constructor: parameter-free */
public ClassicSimilarity() {}
/** Default constructor: parameter-free */
public ClassicSimilarity() {
super();
}
/** Primary constructor. */
public ClassicSimilarity(boolean discountOverlaps) {
super(discountOverlaps);
}
/**
* Implemented as <code>1/sqrt(length)</code>.

View File

@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase {
*/
public DFRSimilarity(
BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) {
this(basicModel, afterEffect, normalization, true);
}
/**
* Creates DFRSimilarity from the three components.
*
* <p>Note that <code>null</code> values are not allowed: if you want no normalization, instead
* pass {@link NoNormalization}.
*
* @param basicModel Basic model of information content
* @param afterEffect First normalization of information gain
* @param normalization Second (length) normalization
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
* are discounted from the document's length.
*/
public DFRSimilarity(
BasicModel basicModel,
AfterEffect afterEffect,
Normalization normalization,
boolean discountOverlaps) {
super(discountOverlaps);
if (basicModel == null || afterEffect == null || normalization == null) {
throw new NullPointerException("null parameters not allowed.");
}

View File

@ -17,8 +17,10 @@
package org.apache.lucene.search.similarities;
import java.util.Collections;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@ -45,7 +47,7 @@ import org.apache.lucene.util.SmallFloat;
* is in this norm, but it is most useful for encoding length normalization information.
*
* <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
* {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
* default implementation encodes length normalization information with {@link SmallFloat} into a
* single byte, this might not be suitable for all purposes.
*
* <p>Many formulas require the use of average document length, which can be computed via a
@ -88,13 +90,49 @@ import org.apache.lucene.util.SmallFloat;
* @lucene.experimental
*/
public abstract class Similarity {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
// Explicitly declared so that we have non-empty javadoc
protected Similarity() {}
/**
* True if overlap tokens (tokens with a position increment of zero) are discounted from the
* document's length.
*/
private final boolean discountOverlaps;
/**
* Computes the normalization value for a field, given the accumulated state of term processing
* for this field (see {@link FieldInvertState}).
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #computeNorm
*/
public final boolean getDiscountOverlaps() {
return discountOverlaps;
}
/** Default constructor. (For invocation by subclass constructors, typically implicit.) */
protected Similarity() {
this(true);
}
/**
* Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time.
*
* <p>Overlap tokens are tokens such as synonyms, that have a {@link PositionIncrementAttribute}
* of zero from the analysis chain.
*
* <p><b>NOTE</b>: If you modify this parameter, you'll need to re-index for it to take effect.
*
* @param discountOverlaps true if overlap tokens should not impact document length for scoring.
*/
protected Similarity(boolean discountOverlaps) {
this.discountOverlaps = discountOverlaps;
}
/**
* Computes the normalization value for a field at index-time.
*
* <p>The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms
* as a single byte.
*
* <p><b>WARNING</b>: The default implementation is used by Lucene's supplied Similarity classes,
* which means you can change the Similarity at runtime without reindexing. If you override this
* method, you'll need to re-index documents for it to take effect.
*
* <p>Matches in longer fields are less precise, so implementations of this method usually set
* smaller values when <code>state.getLength()</code> is large, and larger values when <code>
@ -108,10 +146,20 @@ public abstract class Similarity {
* <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores.
*
* @lucene.experimental
* @param state current processing state for this field
* @param state accumulated state of term processing for this field
* @return computed norm value
*/
public abstract long computeNorm(FieldInvertState state);
public long computeNorm(FieldInvertState state) {
// The field "length" that gets encoded; how it is counted depends on the branch below.
final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS) {
// Field indexed with IndexOptions.DOCS: the unique term count is used as the length.
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
// Overlap tokens are discounted: subtract them from the field length.
numTerms = state.getLength() - state.getNumOverlap();
} else {
// Overlap tokens are not discounted: the full field length is used.
numTerms = state.getLength();
}
// Encode the term count with SmallFloat.intToByte4 and return it as the norm.
return SmallFloat.intToByte4(numTerms);
}
/**
* Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring

View File

@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity {
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
private static final double LOG_2 = Math.log(2);
/**
* True if overlap tokens (tokens with a position of increment of zero) are discounted from the
* document's length.
*/
protected boolean discountOverlaps = true;
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
public SimilarityBase() {}
/**
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
*
* @lucene.experimental
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
/** Default constructor: parameter-free */
public SimilarityBase() {
super();
}
/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
/** Primary constructor. */
public SimilarityBase(boolean discountOverlaps) {
super(discountOverlaps);
}
@Override
@ -179,20 +158,6 @@ public abstract class SimilarityBase extends Similarity {
}
}
/** Encodes the document length in the same way as {@link BM25Similarity}. */
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */

View File

@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@ -326,33 +324,14 @@ import org.apache.lucene.util.SmallFloat;
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
public TFIDFSimilarity() {}
/**
* True if overlap tokens (tokens with a position of increment of zero) are discounted from the
* document's length.
*/
protected boolean discountOverlaps = true;
/**
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
*
* @lucene.experimental
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
/** Default constructor: parameter-free */
public TFIDFSimilarity() {
super();
}
/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
/** Primary constructor. */
public TFIDFSimilarity(boolean discountOverlaps) {
super(discountOverlaps);
}
/**
@ -438,7 +417,7 @@ public abstract class TFIDFSimilarity extends Similarity {
/**
* Compute an index-time normalization value for this field instance.
*
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean)
* @param length the number of terms in the field, optionally {@link #getDiscountOverlaps()
* discounting overlaps}
* @return a length normalization value
*/
@ -453,19 +432,6 @@ public abstract class TFIDFSimilarity extends Similarity {
}
}
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}
@Override
public final SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {

View File

@ -526,17 +526,17 @@ public class TestSimilarityBase extends LuceneTestCase {
// LUCENE-5221
public void testDiscountOverlapsBoost() throws IOException {
BM25Similarity expected = new BM25Similarity(false);
SimilarityBase actual =
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
actual.setDiscountOverlaps(false);
final BM25Similarity expected0 = new BM25Similarity(false);
final SimilarityBase actual0 =
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), false);
FieldInvertState state =
new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
state.setLength(5);
state.setNumOverlap(2);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
expected = new BM25Similarity();
actual.setDiscountOverlaps(true);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
assertEquals(expected0.computeNorm(state), actual0.computeNorm(state));
final BM25Similarity expected1 = new BM25Similarity(true);
final SimilarityBase actual1 =
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), true);
assertEquals(expected1.computeNorm(state), actual1.computeNorm(state));
}
}

View File

@ -395,8 +395,7 @@ public final class SearchImpl extends LukeModel implements Search {
Similarity similarity;
if (config.isUseClassicSimilarity()) {
ClassicSimilarity tfidf = new ClassicSimilarity();
tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
ClassicSimilarity tfidf = new ClassicSimilarity(config.isDiscountOverlaps());
similarity = tfidf;
} else {
BM25Similarity bm25 =

View File

@ -45,10 +45,16 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
private double tf_hyper_base = 1.3d;
private float tf_hyper_xoffset = 10.0f;
/** Default constructor: parameter-free */
public SweetSpotSimilarity() {
super();
}
/** Primary constructor. */
public SweetSpotSimilarity(boolean discountOverlaps) {
super(discountOverlaps);
}
/**
* Sets the baseline and minimum function variables for baselineTf
*
@ -82,11 +88,10 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
*
* @see #lengthNorm
*/
public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) {
public void setLengthNormFactors(int min, int max, float steepness) {
this.ln_min = min;
this.ln_max = max;
this.ln_steep = steepness;
this.discountOverlaps = discountOverlaps;
}
/**

View File

@ -74,7 +74,7 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
public void testSweetSpotComputeNorm() throws IOException {
final SweetSpotSimilarity ss = new SweetSpotSimilarity();
ss.setLengthNormFactors(1, 1, 0.5f, true);
ss.setLengthNormFactors(1, 1, 0.5f);
Similarity d = new ClassicSimilarity();
Similarity s = ss;
@ -87,7 +87,7 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
// make a sweet spot
ss.setLengthNormFactors(3, 10, 0.5f, true);
ss.setLengthNormFactors(3, 10, 0.5f);
for (int i = 3; i <= 10; i++) {
assertEquals("3,10: spot i=" + i, 1.0f, computeNorm(ss, "bogus", i), 0.0f);
@ -101,14 +101,14 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
// separate sweet spot for certain fields
final SweetSpotSimilarity ssBar = new SweetSpotSimilarity();
ssBar.setLengthNormFactors(8, 13, 0.5f, false);
final SweetSpotSimilarity ssYak = new SweetSpotSimilarity();
ssYak.setLengthNormFactors(6, 9, 0.5f, false);
final SweetSpotSimilarity ssA = new SweetSpotSimilarity();
ssA.setLengthNormFactors(5, 8, 0.5f, false);
final SweetSpotSimilarity ssB = new SweetSpotSimilarity();
ssB.setLengthNormFactors(5, 8, 0.1f, false);
final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(false);
ssBar.setLengthNormFactors(8, 13, 0.5f);
final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(false);
ssYak.setLengthNormFactors(6, 9, 0.5f);
final SweetSpotSimilarity ssA = new SweetSpotSimilarity(false);
ssA.setLengthNormFactors(5, 8, 0.5f);
final SweetSpotSimilarity ssB = new SweetSpotSimilarity(false);
ssB.setLengthNormFactors(5, 8, 0.1f);
Similarity sp =
new PerFieldSimilarityWrapper() {