LUCENE-3299: refactoring of Similarity.sloppyFreq and scorePayload

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-07-12 21:37:19 +00:00
parent dcc4e12388
commit d1203a28aa
19 changed files with 123 additions and 95 deletions

View File

@ -156,7 +156,7 @@ Changes in backwards compatibility policy
the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt
for more information (Chris Male)
* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you
* LUCENE-2392, LUCENE-3299: Decoupled vector space scoring from Query/Weight/Scorer. If you
extended Similarity directly before, you should extend TFIDFSimilarity instead.
Similarity is now a lower-level API to implement other scoring algorithms.
See MIGRATE.txt for more details.

View File

@ -393,6 +393,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has
been renamed to Weight.getValueForNormalization().
The scorePayload method now takes a BytesRef. It is never null.
* LUCENE-3283: Lucene's core o.a.l.queryParser QueryParsers have been consolidated into module/queryparser,
where other QueryParsers from the codebase will also be placed. The following classes were moved:
- o.a.l.queryParser.CharStream -> o.a.l.queryparser.classic.CharStream

View File

@ -1,6 +1,7 @@
package org.apache.lucene.search;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -51,7 +52,13 @@ public class DefaultSimilarity extends TFIDFSimilarity {
/** Implemented as <code>1 / (distance + 1)</code>: closer (smaller edit distance) sloppy matches contribute more. */
public float sloppyFreq(int distance) {
return 1.0f / (distance + 1);
}
/** The default implementation returns <code>1</code>, ignoring the payload bytes entirely.
 *  Override this to factor payload data into scoring. */
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1;
}
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
@Override
public float idf(int docFreq, int numDocs) {

View File

@ -221,8 +221,7 @@ public class MultiPhraseQuery extends Query {
return s;
}
} else {
return new SloppyPhraseScorer(this, postingsFreqs, similarity,
slop, similarity.sloppyDocScorer(stats, field, context));
return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.sloppyDocScorer(stats, field, context));
}
}

View File

@ -251,7 +251,7 @@ public class PhraseQuery extends Query {
}
} else {
return
new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context));
new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.sloppyDocScorer(stats, field, context));
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat; // javadoc
import org.apache.lucene.util.TermContext;
@ -100,9 +101,6 @@ import org.apache.lucene.util.TermContext;
* @lucene.experimental
*/
public abstract class Similarity {
public static final int NO_DOC_ID_PROVIDED = -1;
/**
* Computes the normalization value for a field, given the accumulated
* state of term processing for this field (see {@link FieldInvertState}).
@ -120,43 +118,6 @@ public abstract class Similarity {
* @return the calculated byte norm
*/
public abstract byte computeNorm(FieldInvertState state);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency to be used in scoring instead of the exact term count.
*
* <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* @see PhraseQuery#setSlop(int)
* @param distance the edit distance of this sloppy phrase match
* @return the frequency increment for this match
*/
public abstract float sloppyFreq(int distance);
/**
* Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
* what is in the byte array.
* <p>
* The default implementation returns 1.
*
* @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information
* @param start The start position of the payload
* @param end The end position of the payload
* @param payload The payload byte array to be scored
* @param offset The offset into the payload array
* @param length The length in the array
* @return An implementation dependent float to be used as a scoring factor
*
*/
// TODO: maybe switch this API to BytesRef?
public float scorePayload(int docId, int start, int end, byte [] payload, int offset, int length)
{
return 1;
}
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
@ -216,6 +177,12 @@ public abstract class Similarity {
* @return document's score
*/
public abstract float score(int doc, float freq);
/** Computes the amount of a sloppy phrase match, based on an edit distance. */
public abstract float computeSlopFactor(int distance);
/** Calculate a scoring factor based on the data in the payload. */
public abstract float computePayloadFactor(int doc, int start, int end, BytesRef payload);
/**
* Explain the score for a single document

View File

@ -25,13 +25,11 @@ final class SloppyPhraseScorer extends PhraseScorer {
private PhrasePositions repeats[];
private PhrasePositions tmpPos[]; // for flipping repeating pps.
private boolean checkedRepeats;
private final Similarity similarity;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, docScorer);
this.slop = slop;
this.similarity = similarity;
}
/**
@ -80,7 +78,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
int matchLength = end - start;
if (matchLength <= slop)
freq += similarity.sloppyFreq(matchLength); // score match
freq += docScorer.computeSlopFactor(matchLength); // score match
if (pp.position > end)
end = pp.position;

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
@ -666,6 +667,34 @@ public abstract class TFIDFSimilarity extends Similarity {
return SmallFloat.floatToByte315(f);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency to be used in scoring instead of the exact term count.
*
* <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* @see PhraseQuery#setSlop(int)
* @param distance the edit distance of this sloppy phrase match
* @return the frequency increment for this match
*/
public abstract float sloppyFreq(int distance);
/**
* Calculate a scoring factor based on the data in the payload. Implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
* what is in the byte array.
*
* @param doc The docId currently being scored.
* @param start The start position of the payload
* @param end The end position of the payload
* @param payload The payload byte array to be scored
* @return An implementation dependent float to be used as a scoring factor
*/
public abstract float scorePayload(int doc, int start, int end, BytesRef payload);
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
TermContext... termContexts) throws IOException {
@ -736,6 +765,16 @@ public abstract class TFIDFSimilarity extends Similarity {
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
/** Bridges the generic {@code Similarity} API to this class's {@link #sloppyFreq(int)} hook. */
@Override
public float computeSlopFactor(int distance) {
return sloppyFreq(distance);
}
/** Bridges the generic {@code Similarity} API to this class's {@link #scorePayload} hook. */
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return scorePayload(doc, start, end, payload);
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);

View File

@ -23,6 +23,7 @@ import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered;
@ -32,6 +33,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
@ -44,13 +46,13 @@ import java.util.Iterator;
* in the value of the payloads located at each of the positions where the
* {@link org.apache.lucene.search.spans.TermSpans} occurs.
* <p/>
* In order to take advantage of this, you must override
* {@link org.apache.lucene.search.Similarity#scorePayload}
* NOTE: In order to take advantage of this with the default scoring implementation
* ({@link DefaultSimilarity}), you must override {@link DefaultSimilarity#scorePayload(int, int, int, BytesRef)},
* which returns 1 by default.
* <p/>
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
*
* @see org.apache.lucene.search.Similarity#scorePayload
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
*/
public class PayloadNearQuery extends SpanNearQuery {
protected String fieldName;
@ -186,7 +188,7 @@ public class PayloadNearQuery extends SpanNearQuery {
protected PayloadNearSpanScorer(Spans spans, Weight weight,
Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, docScorer);
super(spans, weight, docScorer);
this.spans = spans;
}
@ -209,6 +211,9 @@ public class PayloadNearQuery extends SpanNearQuery {
}
}
// TODO change the whole spans api to use bytesRef, or nuke spans
BytesRef scratch = new BytesRef();
/**
* By default, uses the {@link PayloadFunction} to score the payloads, but
* can be overridden to do other things.
@ -221,9 +226,12 @@ public class PayloadNearQuery extends SpanNearQuery {
*/
protected void processPayloads(Collection<byte[]> payLoads, int start, int end) {
for (final byte[] thePayload : payLoads) {
scratch.bytes = thePayload;
scratch.offset = 0;
scratch.length = thePayload.length;
payloadScore = function.currentScore(doc, fieldName, start, end,
payloadsSeen, payloadScore, similarity.scorePayload(doc,
spans.start(), spans.end(), thePayload, 0, thePayload.length));
payloadsSeen, payloadScore, docScorer.computePayloadFactor(doc,
spans.start(), spans.end(), scratch));
++payloadsSeen;
}
}
@ -240,7 +248,7 @@ public class PayloadNearQuery extends SpanNearQuery {
payloadsSeen = 0;
do {
int matchLength = spans.end() - spans.start();
freq += similarity.sloppyFreq(matchLength);
freq += docScorer.computeSlopFactor(matchLength);
Spans[] spansArr = new Spans[1];
spansArr[0] = spans;
getPayloads(spansArr);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.payloads;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
@ -42,12 +43,13 @@ import java.io.IOException;
* {@link org.apache.lucene.search.spans.SpanTermQuery} except that it factors
* in the value of the payload located at each of the positions where the
* {@link org.apache.lucene.index.Term} occurs.
* <p>
* In order to take advantage of this, you must override
* {@link org.apache.lucene.search.Similarity#scorePayload(int, int, int, byte[],int,int)}
* <p/>
* NOTE: In order to take advantage of this with the default scoring implementation
* ({@link DefaultSimilarity}), you must override {@link DefaultSimilarity#scorePayload(int, int, int, BytesRef)},
* which returns 1 by default.
* <p>
* <p/>
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
**/
public class PayloadTermQuery extends SpanTermQuery {
protected PayloadFunction function;
@ -79,7 +81,7 @@ public class PayloadTermQuery extends SpanTermQuery {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadTermSpanScorer((TermSpans) query.getSpans(context),
this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
this, similarity.sloppyDocScorer(stats, query.getField(), context));
}
protected class PayloadTermSpanScorer extends SpanScorer {
@ -88,9 +90,8 @@ public class PayloadTermQuery extends SpanTermQuery {
protected int payloadsSeen;
private final TermSpans termSpans;
public PayloadTermSpanScorer(TermSpans spans, Weight weight,
Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, docScorer);
public PayloadTermSpanScorer(TermSpans spans, Weight weight, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, docScorer);
termSpans = spans;
}
@ -106,7 +107,7 @@ public class PayloadTermQuery extends SpanTermQuery {
while (more && doc == spans.doc()) {
int matchLength = spans.end() - spans.start();
freq += similarity.sloppyFreq(matchLength);
freq += docScorer.computeSlopFactor(matchLength);
processPayload(similarity);
more = spans.next();// this moves positions to the next match in this
@ -122,17 +123,10 @@ public class PayloadTermQuery extends SpanTermQuery {
if (payload != null) {
payloadScore = function.currentScore(doc, term.field(),
spans.start(), spans.end(), payloadsSeen, payloadScore,
similarity.scorePayload(doc, spans.start(),
spans.end(), payload.bytes,
payload.offset,
payload.length));
docScorer.computePayloadFactor(doc, spans.start(), spans.end(), payload));
} else {
payloadScore = function.currentScore(doc, term.field(),
spans.start(), spans.end(), payloadsSeen, payloadScore,
similarity.scorePayload(doc, spans.start(),
spans.end(), null,
0,
0));
spans.start(), spans.end(), payloadsSeen, payloadScore, 1F);
}
payloadsSeen++;

View File

@ -35,13 +35,11 @@ public class SpanScorer extends Scorer {
protected int doc;
protected float freq;
protected final Similarity similarity;
protected final Similarity.SloppyDocScorer docScorer;
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer)
protected SpanScorer(Spans spans, Weight weight, Similarity.SloppyDocScorer docScorer)
throws IOException {
super(weight);
this.similarity = similarity;
this.docScorer = docScorer;
this.spans = spans;
@ -83,7 +81,7 @@ public class SpanScorer extends Scorer {
freq = 0.0f;
do {
int matchLength = spans.end() - spans.start();
freq += similarity.sloppyFreq(matchLength);
freq += docScorer.computeSlopFactor(matchLength);
more = spans.next();
} while (more && (doc == spans.doc()));
return true;

View File

@ -67,7 +67,7 @@ public class SpanWeight extends Weight {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
return new SpanScorer(query.getSpans(context), this, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util._TestUtil;
@ -48,6 +49,7 @@ public class TestOmitTf extends LuceneTestCase {
@Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException {
return new Explanation(1.0f, "Inexplicable");
}
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1.0f; }
};
}
}

View File

@ -265,11 +265,6 @@ final class JustCompileSearch {
public byte computeNorm(FieldInvertState state) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
// Compile-only stub: intentionally unimplemented (see JustCompileSearch).
@Override
public float sloppyFreq(int distance) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
}
static final class JustCompileSimilarityProvider implements SimilarityProvider {

View File

@ -30,6 +30,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
@ -144,11 +145,6 @@ public class TestDocValuesScoring extends LuceneTestCase {
return sim.computeNorm(state);
}
@Override
public float sloppyFreq(int distance) {
return sim.sloppyFreq(distance);
}
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
return sim.computeStats(searcher, fieldName, queryBoost, termContexts);
@ -188,6 +184,16 @@ public class TestDocValuesScoring extends LuceneTestCase {
return (float) values.getFloat(doc) * sub.score(doc, freq);
}
// Delegates the slop factor to the wrapped scorer unchanged; only score() is boosted.
@Override
public float computeSlopFactor(int distance) {
return sub.computeSlopFactor(distance);
}
// Delegates the payload factor to the wrapped scorer unchanged; only score() is boosted.
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return sub.computePayloadFactor(doc, start, end, payload);
}
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")");

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.MultiNorms;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestSimilarityProvider extends LuceneTestCase {
@ -125,6 +126,11 @@ public class TestSimilarityProvider extends LuceneTestCase {
public float idf(int docFreq, int numDocs) {
return 1f;
}
// Neutral payload factor for this test similarity.
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1f;
}
}
private class Sim2 extends TFIDFSimilarity {
@ -147,5 +153,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
public float idf(int docFreq, int numDocs) {
return 10f;
}
// Neutral payload factor for this test similarity.
@Override
public float scorePayload(int doc, int start, int end, BytesRef payload) {
return 1f;
}
}
}

View File

@ -42,6 +42,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
@ -315,9 +316,9 @@ public class TestPayloadNearQuery extends LuceneTestCase {
return new DefaultSimilarity() {
@Override
public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
public float scorePayload(int docId, int start, int end, BytesRef payload) {
//we know it is size 4 here, so ignore the offset/length
return payload[offset];
return payload.bytes[payload.offset];
}
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.payloads;
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.English;
import org.apache.lucene.search.DefaultSimilarityProvider;
@ -309,9 +310,9 @@ public class TestPayloadTermQuery extends LuceneTestCase {
// TODO: Remove warning after API has been finalized
@Override
public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
public float scorePayload(int docId, int start, int end, BytesRef payload) {
//we know it is size 4 here, so ignore the offset/length
return payload[offset];
return payload.bytes[payload.offset];
}
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -135,8 +135,8 @@ final class JustCompileSearchSpans {
static final class JustCompileSpanScorer extends SpanScorer {
protected JustCompileSpanScorer(Spans spans, Weight weight,
Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, docScorer);
Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, docScorer);
}
@Override