mirror of https://github.com/apache/lucene.git
similarities: provide default computeNorm implementation; remove remaining discountOverlaps setters (#13757)
Co-authored-by: Robert Muir <rmuir@apache.org>
This commit is contained in:
parent
f778cc4924
commit
7c056ab88c
|
@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -33,7 +31,6 @@ import org.apache.lucene.util.SmallFloat;
|
|||
public class BM25Similarity extends Similarity {
|
||||
private final float k1;
|
||||
private final float b;
|
||||
private final boolean discountOverlaps;
|
||||
|
||||
/**
|
||||
* BM25 with the supplied parameter values.
|
||||
|
@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity {
|
|||
* within the range {@code [0..1]}
|
||||
*/
|
||||
public BM25Similarity(float k1, float b, boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
if (Float.isFinite(k1) == false || k1 < 0) {
|
||||
throw new IllegalArgumentException(
|
||||
"illegal k1 value: " + k1 + ", must be a non-negative finite value");
|
||||
|
@ -55,7 +53,6 @@ public class BM25Similarity extends Similarity {
|
|||
}
|
||||
this.k1 = k1;
|
||||
this.b = b;
|
||||
this.discountOverlaps = discountOverlaps;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -110,15 +107,6 @@ public class BM25Similarity extends Similarity {
|
|||
return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
*
|
||||
* @see #BM25Similarity(float, float, boolean)
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] LENGTH_TABLE = new float[256];
|
||||
|
||||
|
@ -128,19 +116,6 @@ public class BM25Similarity extends Similarity {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes a score factor for a simple term and returns an explanation for that score factor.
|
||||
*
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -25,22 +24,15 @@ import org.apache.lucene.search.TermStatistics;
|
|||
* Simple similarity that gives terms a score that is equal to their query boost. This similarity is
|
||||
* typically used with disabled norms since neither document statistics nor index statistics are
|
||||
* used for scoring. That said, if norms are enabled, they will be computed the same way as {@link
|
||||
* SimilarityBase} and {@link BM25Similarity} with {@link
|
||||
* SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity}
|
||||
* can be changed after the index has been created.
|
||||
* SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps()
|
||||
* discounted overlaps} so that the {@link Similarity} can be changed after the index has been
|
||||
* created.
|
||||
*/
|
||||
public class BooleanSimilarity extends Similarity {
|
||||
|
||||
private static final Similarity BM25_SIM = new BM25Similarity();
|
||||
|
||||
/** Sole constructor */
|
||||
public BooleanSimilarity() {}
|
||||
|
||||
@Override
|
||||
public long computeNorm(FieldInvertState state) {
|
||||
return BM25_SIM.computeNorm(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimScorer scorer(
|
||||
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
|
|
|
@ -26,8 +26,15 @@ import org.apache.lucene.search.TermStatistics;
|
|||
*/
|
||||
public class ClassicSimilarity extends TFIDFSimilarity {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public ClassicSimilarity() {}
|
||||
/** Default constructor: parameter-free */
|
||||
public ClassicSimilarity() {
|
||||
super();
|
||||
}
|
||||
|
||||
/** Primary constructor: sets whether overlap tokens are discounted from document length. */
|
||||
public ClassicSimilarity(boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implemented as <code>1/sqrt(length)</code>.
|
||||
|
|
|
@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
*/
|
||||
public DFRSimilarity(
|
||||
BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) {
|
||||
this(basicModel, afterEffect, normalization, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates DFRSimilarity from the three components.
|
||||
*
|
||||
* <p>Note that <code>null</code> values are not allowed: if you want no normalization, instead
|
||||
* pass {@link NoNormalization}.
|
||||
*
|
||||
* @param basicModel Basic model of information content
|
||||
* @param afterEffect First normalization of information gain
|
||||
* @param normalization Second (length) normalization
|
||||
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
|
||||
* are discounted from the document's length.
|
||||
*/
|
||||
public DFRSimilarity(
|
||||
BasicModel basicModel,
|
||||
AfterEffect afterEffect,
|
||||
Normalization normalization,
|
||||
boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
if (basicModel == null || afterEffect == null || normalization == null) {
|
||||
throw new NullPointerException("null parameters not allowed.");
|
||||
}
|
||||
|
|
|
@ -17,8 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Collections;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -45,7 +47,7 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* is in this norm, but it is most useful for encoding length normalization information.
|
||||
*
|
||||
* <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
|
||||
* {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
|
||||
* default implementation encodes length normalization information with {@link SmallFloat} into a
|
||||
* single byte, this might not be suitable for all purposes.
|
||||
*
|
||||
* <p>Many formulas require the use of average document length, which can be computed via a
|
||||
|
@ -88,13 +90,49 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Similarity {
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
|
||||
// Explicitly declared so that we have non-empty javadoc
|
||||
protected Similarity() {}
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position increment of zero) are discounted from the
|
||||
* document's length.
|
||||
*/
|
||||
private final boolean discountOverlaps;
|
||||
|
||||
/**
|
||||
* Computes the normalization value for a field, given the accumulated state of term processing
|
||||
* for this field (see {@link FieldInvertState}).
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
*
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public final boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
/** Default constructor. (For invocation by subclass constructors, typically implicit.) */
|
||||
protected Similarity() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time.
|
||||
*
|
||||
* <p>Overlap tokens are tokens such as synonyms, that have a {@link PositionIncrementAttribute}
|
||||
* of zero from the analysis chain.
|
||||
*
|
||||
* <p><b>NOTE</b>: If you modify this parameter, you'll need to re-index for it to take effect.
|
||||
*
|
||||
* @param discountOverlaps true if overlap tokens should not impact document length for scoring.
|
||||
*/
|
||||
protected Similarity(boolean discountOverlaps) {
|
||||
this.discountOverlaps = discountOverlaps;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the normalization value for a field at index-time.
|
||||
*
|
||||
* <p>The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms
|
||||
* as a single byte.
|
||||
*
|
||||
* <p><b>WARNING</b>: The default implementation is used by Lucene's supplied Similarity classes,
|
||||
* which means you can change the Similarity at runtime without reindexing. If you override this
|
||||
* method, you'll need to re-index documents for it to take effect.
|
||||
*
|
||||
* <p>Matches in longer fields are less precise, so implementations of this method usually set
|
||||
* smaller values when <code>state.getLength()</code> is large, and larger values when <code>
|
||||
|
@ -108,10 +146,20 @@ public abstract class Similarity {
|
|||
* <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @param state current processing state for this field
|
||||
* @param state accumulated state of term processing for this field
|
||||
* @return computed norm value
|
||||
*/
|
||||
public abstract long computeNorm(FieldInvertState state);
|
||||
public long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring
|
||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity {
|
|||
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
|
||||
private static final double LOG_2 = Math.log(2);
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position of increment of zero) are discounted from the
|
||||
* document's length.
|
||||
*/
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
|
||||
public SimilarityBase() {}
|
||||
|
||||
/**
|
||||
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
/** Default constructor: parameter-free */
|
||||
public SimilarityBase() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
*
|
||||
* @see #setDiscountOverlaps
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
/** Primary constructor: sets whether overlap tokens are discounted from document length. */
|
||||
public SimilarityBase(boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -179,20 +158,6 @@ public abstract class SimilarityBase extends Similarity {
|
|||
}
|
||||
}
|
||||
|
||||
/** Encodes the document length in the same way as {@link BM25Similarity}. */
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
// ----------------------------- Static methods ------------------------------
|
||||
|
||||
/** Returns the base two logarithm of {@code x}. */
|
||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -326,33 +324,14 @@ import org.apache.lucene.util.SmallFloat;
|
|||
*/
|
||||
public abstract class TFIDFSimilarity extends Similarity {
|
||||
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
|
||||
public TFIDFSimilarity() {}
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position of increment of zero) are discounted from the
|
||||
* document's length.
|
||||
*/
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/**
|
||||
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
/** Default constructor: parameter-free */
|
||||
public TFIDFSimilarity() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
*
|
||||
* @see #setDiscountOverlaps
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
/** Primary constructor: sets whether overlap tokens are discounted from document length. */
|
||||
public TFIDFSimilarity(boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -438,7 +417,7 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
/**
|
||||
* Compute an index-time normalization value for this field instance.
|
||||
*
|
||||
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean)
|
||||
* @param length the number of terms in the field, optionally {@link #getDiscountOverlaps()
|
||||
* discounting overlaps}
|
||||
* @return a length normalization value
|
||||
*/
|
||||
|
@ -453,19 +432,6 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer scorer(
|
||||
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
|
|
|
@ -526,17 +526,17 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
|
||||
// LUCENE-5221
|
||||
public void testDiscountOverlapsBoost() throws IOException {
|
||||
BM25Similarity expected = new BM25Similarity(false);
|
||||
SimilarityBase actual =
|
||||
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
|
||||
actual.setDiscountOverlaps(false);
|
||||
final BM25Similarity expected0 = new BM25Similarity(false);
|
||||
final SimilarityBase actual0 =
|
||||
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), false);
|
||||
FieldInvertState state =
|
||||
new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
|
||||
state.setLength(5);
|
||||
state.setNumOverlap(2);
|
||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||
expected = new BM25Similarity();
|
||||
actual.setDiscountOverlaps(true);
|
||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||
assertEquals(expected0.computeNorm(state), actual0.computeNorm(state));
|
||||
final BM25Similarity expected1 = new BM25Similarity(true);
|
||||
final SimilarityBase actual1 =
|
||||
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), true);
|
||||
assertEquals(expected1.computeNorm(state), actual1.computeNorm(state));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -395,8 +395,7 @@ public final class SearchImpl extends LukeModel implements Search {
|
|||
Similarity similarity;
|
||||
|
||||
if (config.isUseClassicSimilarity()) {
|
||||
ClassicSimilarity tfidf = new ClassicSimilarity();
|
||||
tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
|
||||
ClassicSimilarity tfidf = new ClassicSimilarity(config.isDiscountOverlaps());
|
||||
similarity = tfidf;
|
||||
} else {
|
||||
BM25Similarity bm25 =
|
||||
|
|
|
@ -45,10 +45,16 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
|
|||
private double tf_hyper_base = 1.3d;
|
||||
private float tf_hyper_xoffset = 10.0f;
|
||||
|
||||
/** Default constructor: parameter-free */
|
||||
public SweetSpotSimilarity() {
|
||||
super();
|
||||
}
|
||||
|
||||
/** Primary constructor: sets whether overlap tokens are discounted from document length. */
|
||||
public SweetSpotSimilarity(boolean discountOverlaps) {
|
||||
super(discountOverlaps);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the baseline and minimum function variables for baselineTf
|
||||
*
|
||||
|
@ -82,11 +88,10 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
|
|||
*
|
||||
* @see #lengthNorm
|
||||
*/
|
||||
public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) {
|
||||
public void setLengthNormFactors(int min, int max, float steepness) {
|
||||
this.ln_min = min;
|
||||
this.ln_max = max;
|
||||
this.ln_steep = steepness;
|
||||
this.discountOverlaps = discountOverlaps;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -74,7 +74,7 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
|
|||
public void testSweetSpotComputeNorm() throws IOException {
|
||||
|
||||
final SweetSpotSimilarity ss = new SweetSpotSimilarity();
|
||||
ss.setLengthNormFactors(1, 1, 0.5f, true);
|
||||
ss.setLengthNormFactors(1, 1, 0.5f);
|
||||
|
||||
Similarity d = new ClassicSimilarity();
|
||||
Similarity s = ss;
|
||||
|
@ -87,7 +87,7 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
|
|||
|
||||
// make a sweet spot
|
||||
|
||||
ss.setLengthNormFactors(3, 10, 0.5f, true);
|
||||
ss.setLengthNormFactors(3, 10, 0.5f);
|
||||
|
||||
for (int i = 3; i <= 10; i++) {
|
||||
assertEquals("3,10: spot i=" + i, 1.0f, computeNorm(ss, "bogus", i), 0.0f);
|
||||
|
@ -101,14 +101,14 @@ public class TestSweetSpotSimilarity extends LuceneTestCase {
|
|||
|
||||
// separate sweet spot for certain fields
|
||||
|
||||
final SweetSpotSimilarity ssBar = new SweetSpotSimilarity();
|
||||
ssBar.setLengthNormFactors(8, 13, 0.5f, false);
|
||||
final SweetSpotSimilarity ssYak = new SweetSpotSimilarity();
|
||||
ssYak.setLengthNormFactors(6, 9, 0.5f, false);
|
||||
final SweetSpotSimilarity ssA = new SweetSpotSimilarity();
|
||||
ssA.setLengthNormFactors(5, 8, 0.5f, false);
|
||||
final SweetSpotSimilarity ssB = new SweetSpotSimilarity();
|
||||
ssB.setLengthNormFactors(5, 8, 0.1f, false);
|
||||
final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(false);
|
||||
ssBar.setLengthNormFactors(8, 13, 0.5f);
|
||||
final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(false);
|
||||
ssYak.setLengthNormFactors(6, 9, 0.5f);
|
||||
final SweetSpotSimilarity ssA = new SweetSpotSimilarity(false);
|
||||
ssA.setLengthNormFactors(5, 8, 0.5f);
|
||||
final SweetSpotSimilarity ssB = new SweetSpotSimilarity(false);
|
||||
ssB.setLengthNormFactors(5, 8, 0.1f);
|
||||
|
||||
Similarity sp =
|
||||
new PerFieldSimilarityWrapper() {
|
||||
|
|
Loading…
Reference in New Issue