mirror of https://github.com/apache/lucene.git
LUCENE-2959: add state of the art ranking to Lucene
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1169470 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82649a21b4
commit
cfaf91c739
|
@ -495,6 +495,34 @@ New features
|
|||
* LUCENE-3423: add Terms.getDocCount(), which returns the number of documents
|
||||
that have at least one term for a field. (Yonik Seeley, Robert Muir)
|
||||
|
||||
* LUCENE-2959: Added a variety of different relevance ranking systems to Lucene.
|
||||
|
||||
- Added Okapi BM25, Language Models, Divergence from Randomness, and
|
||||
Information-Based Models. The models are pluggable, support all of Lucene's
|
||||
features (boosts, slops, explanations, etc) and queries (spans, etc).
|
||||
|
||||
- All models default to the same index-time norm encoding as DefaultSimilarity,
|
||||
so you can easily try these out/switch back and forth/run experiments and
|
||||
comparisons without reindexing. Note: most of the models do rely upon index
|
||||
statistics that are new in Lucene 4.0, so for existing 3.x indexes it's a good
|
||||
idea to upgrade your index to the new format with IndexUpgrader first.
|
||||
|
||||
- Added a new subclass SimilarityBase which provides a simplified API
|
||||
for plugging in new ranking algorithms without dealing with all of the
|
||||
nuances and implementation details of Lucene.
|
||||
|
||||
- Added a new helper class BasicSimilarityProvider that just applies one
|
||||
scoring algorithm to all fields, with queryNorm() and coord() returning 1.
|
||||
In general, it is recommended to disable coord() when using the new models.
|
||||
For example, to use BM25 for all fields:
|
||||
searcher.setSimilarityProvider(new BasicSimilarityProvider(new BM25Similarity()));
|
||||
|
||||
If you instead want to apply different similarities (e.g. ones with different
|
||||
parameter values or different algorithms entirely) to different fields, implement
|
||||
SimilarityProvider with your per-field logic.
|
||||
|
||||
(David Mark Nemeskey via Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||
|
|
|
@ -43,7 +43,7 @@ import org.apache.lucene.index.IndexableField;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermVectorOffsetInfo;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.CollectionUtil;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
|
|
@ -57,8 +57,8 @@ import org.apache.lucene.search.Collector;
|
|||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.RAMDirectory; // for javadocs
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
|
|
@ -22,9 +22,9 @@ import java.util.Date;
|
|||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
package org.apache.lucene.misc;
|
||||
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
|
||||
/**
|
||||
|
|
|
@ -26,13 +26,13 @@ import org.apache.lucene.document.StringField;
|
|||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -18,11 +18,11 @@
|
|||
|
||||
package org.apache.lucene.misc;
|
||||
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.TFIDFSimilarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
|
||||
|
|
|
@ -31,13 +31,13 @@ import org.apache.lucene.index.IndexWriter;
|
|||
import org.apache.lucene.index.MultiNorms;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -31,6 +31,8 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
|
|
@ -223,14 +223,14 @@ public class Field implements IndexableField {
|
|||
* document.
|
||||
*
|
||||
* <p>The boost is used to compute the norm factor for the field. By
|
||||
* default, in the {@link org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method,
|
||||
* default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)} method,
|
||||
* the boost value is multiplied by the length normalization factor and then
|
||||
* rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
|
||||
* rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
|
||||
* index. One should attempt to ensure that this product does not overflow
|
||||
* the range of that encoding.
|
||||
*
|
||||
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
|
||||
* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
|
||||
* @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)
|
||||
* @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)
|
||||
*/
|
||||
public void setBoost(float boost) {
|
||||
this.boost = boost;
|
||||
|
|
|
@ -32,7 +32,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
|
|||
import org.apache.lucene.index.DocumentsWriterPerThreadPool.ThreadState;
|
||||
import org.apache.lucene.index.FieldInfos.FieldNumberBiMap;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ import java.text.NumberFormat;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FlushInfo;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
|
|
@ -32,7 +32,7 @@ import org.apache.lucene.index.codecs.CodecProvider;
|
|||
import org.apache.lucene.index.codecs.PerDocValues;
|
||||
import org.apache.lucene.index.values.IndexDocValues;
|
||||
import org.apache.lucene.search.FieldCache; // javadocs
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
@ -1012,7 +1012,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
|||
*
|
||||
* @see #norms(String)
|
||||
* @see Similarity#computeNorm(FieldInvertState)
|
||||
* @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
|
||||
* @see org.apache.lucene.search.similarities.DefaultSimilarity#decodeNormValue(byte)
|
||||
* @throws StaleReaderException if the index has changed
|
||||
* since this reader was opened
|
||||
* @throws CorruptIndexException if the index is corrupt
|
||||
|
|
|
@ -22,7 +22,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
|
|||
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
|
|
@ -17,7 +17,7 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/** Taps into DocInverter, as an InvertedDocEndConsumer,
|
||||
|
|
|
@ -24,7 +24,8 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.util.ToStringUtils;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs;
|
||||
import org.apache.lucene.search.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.TermQuery.TermWeight;
|
||||
|
||||
import java.io.IOException;
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery.BooleanWeight;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.Scorer.ChildScorer;
|
||||
|
||||
/* See the description in BooleanScorer.java, comparing
|
||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.search.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
final class ExactPhraseScorer extends Scorer {
|
||||
private final int endMinus1;
|
||||
|
|
|
@ -38,6 +38,8 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Weight.ScorerContext;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.NIOFSDirectory; // javadoc
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
|
|
|
@ -26,7 +26,8 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
@ -164,8 +165,7 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
|
||||
if (termArrays.size() == 0) // optimize zero-term case
|
||||
return null;
|
||||
assert !termArrays.isEmpty();
|
||||
final IndexReader reader = context.reader;
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
|
||||
|
@ -249,7 +249,11 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) {
|
||||
if (termArrays.size() == 1) { // optimize one-term case
|
||||
if (termArrays.isEmpty()) {
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
bq.setBoost(getBoost());
|
||||
return bq;
|
||||
} else if (termArrays.size() == 1) { // optimize one-term case
|
||||
Term[] terms = termArrays.get(0);
|
||||
BooleanQuery boq = new BooleanQuery(true);
|
||||
for (int i=0; i<terms.length; i++) {
|
||||
|
|
|
@ -29,7 +29,8 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
@ -119,7 +120,11 @@ public class PhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
if (terms.size() == 1) {
|
||||
if (terms.isEmpty()) {
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
bq.setBoost(getBoost());
|
||||
return bq;
|
||||
} else if (terms.size() == 1) {
|
||||
TermQuery tq = new TermQuery(terms.get(0));
|
||||
tq.setBoost(getBoost());
|
||||
return tq;
|
||||
|
@ -208,8 +213,7 @@ public class PhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
|
||||
if (terms.size() == 0) // optimize zero-term case
|
||||
return null;
|
||||
assert !terms.isEmpty();
|
||||
final IndexReader reader = context.reader;
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
|
||||
|
@ -285,12 +289,6 @@ public class PhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
if (terms.size() == 1) { // optimize one-term case
|
||||
Term term = terms.get(0);
|
||||
Query termQuery = new TermQuery(term);
|
||||
termQuery.setBoost(getBoost());
|
||||
return termQuery.createWeight(searcher);
|
||||
}
|
||||
return new PhraseWeight(searcher);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/** Expert: Scoring functionality for phrase queries.
|
||||
* <br>A document is considered matching if it contains the phrase-query terms
|
||||
* at "valid" positions. What "valid positions" are
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import java.util.LinkedHashSet;
|
||||
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
final class SloppyPhraseScorer extends PhraseScorer {
|
||||
private int slop;
|
||||
private PhrasePositions repeats[];
|
||||
|
|
|
@ -28,7 +28,8 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
|
||||
*/
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
|
||||
/**
|
||||
* Expert: Calculate query weights and build query scorers.
|
||||
|
|
|
@ -22,10 +22,10 @@ import org.apache.lucene.search.ComplexExplanation;
|
|||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.spans.NearSpansOrdered;
|
||||
import org.apache.lucene.search.spans.NearSpansUnordered;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
|
@ -52,7 +52,7 @@ import java.util.Iterator;
|
|||
* <p/>
|
||||
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
|
||||
*
|
||||
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
|
||||
* @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
|
||||
*/
|
||||
public class PayloadNearQuery extends SpanNearQuery {
|
||||
protected String fieldName;
|
||||
|
|
|
@ -20,16 +20,16 @@ package org.apache.lucene.search.payloads;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.ComplexExplanation;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.Weight.ScorerContext;
|
||||
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.spans.TermSpans;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
|
@ -49,7 +49,7 @@ import java.io.IOException;
|
|||
* which returns 1 by default.
|
||||
* <p/>
|
||||
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
|
||||
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
|
||||
* @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
|
||||
**/
|
||||
public class PayloadTermQuery extends SpanTermQuery {
|
||||
protected PayloadFunction function;
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* This class acts as the base class for the implementations of the <em>first
|
||||
* normalization of the informative content</em> in the DFR framework. This
|
||||
* component is also called the <em>after effect</em> and is defined by the
|
||||
* formula <em>Inf<sub>2</sub> = 1 - Prob<sub>2</sub></em>, where
|
||||
* <em>Prob<sub>2</sub></em> measures the <em>information gain</em>.
|
||||
*
|
||||
* @see DFRSimilarity
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class AfterEffect {
|
||||
/** Returns the aftereffect score. */
|
||||
public abstract float score(BasicStats stats, float tfn);
|
||||
|
||||
/** Returns an explanation for the score. */
|
||||
public abstract Explanation explain(BasicStats stats, float tfn);
|
||||
|
||||
/** Implementation used when there is no aftereffect. */
|
||||
public static final class NoAfterEffect extends AfterEffect {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
return 1f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
return new Explanation(1, "no aftereffect");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the code of the
|
||||
* after effect formula. Refer to the original paper for the list.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Model of the information gain based on the ratio of two Bernoulli processes.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AfterEffectB extends AfterEffect {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
long F = stats.getTotalTermFreq();
|
||||
int n = stats.getDocFreq();
|
||||
return (F + 1) / (n * (tfn + 1));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(score(stats, tfn));
|
||||
result.addDetail(new Explanation(tfn, "tfn"));
|
||||
result.addDetail(new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
|
||||
result.addDetail(new Explanation(stats.getDocFreq(), "docFreq"));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "B";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Model of the information gain based on Laplace's law of succession.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class AfterEffectL extends AfterEffect {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
return 1 / (tfn + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(score(stats, tfn));
|
||||
result.addDetail(new Explanation(tfn, "tfn"));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "L";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,339 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
||||
/**
|
||||
* BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
|
||||
* Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3.
|
||||
* In Proceedings of the Third Text REtrieval Conference (TREC 1994).
|
||||
* Gaithersburg, USA, November 1994.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BM25Similarity extends Similarity {
|
||||
private final float k1;
|
||||
private final float b;
|
||||
// TODO: should we add a delta like sifaka.cs.uiuc.edu/~ylv2/pub/sigir11-bm25l.pdf ?
|
||||
|
||||
/**
 * Creates a BM25 similarity with explicit parameter values. Both values are
 * stored as given; no range checking is performed here.
 *
 * @param k1 value assigned to the {@code k1} field
 * @param b value assigned to the {@code b} field
 */
public BM25Similarity(float k1, float b) {
  this.b = b;
  this.k1 = k1;
}
|
||||
|
||||
/**
 * BM25 with these default values:
 * <ul>
 *   <li>{@code k1 = 1.2},</li>
 *   <li>{@code b = 0.75}.</li>
 * </ul>
 */
public BM25Similarity() {
  this(1.2f, 0.75f);
}
|
||||
|
||||
/** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
|
||||
protected float idf(int docFreq, int numDocs) {
|
||||
return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
|
||||
}
|
||||
|
||||
/** Implemented as <code>1 / (distance + 1)</code>. */
|
||||
protected float sloppyFreq(int distance) {
|
||||
return 1.0f / (distance + 1);
|
||||
}
|
||||
|
||||
/** The default implementation returns <code>1</code> */
|
||||
protected float scorePayload(int doc, int start, int end, BytesRef payload) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
|
||||
* or returns <code>1</code> if the index does not store sumTotalTermFreq (Lucene 3.x indexes
|
||||
* or any field that omits frequency information). */
|
||||
protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field);
|
||||
if (terms == null) {
|
||||
// field does not exist;
|
||||
return 1f;
|
||||
}
|
||||
long sumTotalTermFreq = terms.getSumTotalTermFreq();
|
||||
long maxdoc = searcher.maxDoc();
|
||||
return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc);
|
||||
}
|
||||
|
||||
/** The default implementation encodes <code>boost / sqrt(length)</code>
|
||||
* with {@link SmallFloat#floatToByte315(float)}. This is compatible with
|
||||
* Lucene's default implementation. If you change this, then you should
|
||||
* change {@link #decodeNormValue(byte)} to match. */
|
||||
protected byte encodeNormValue(float boost, int fieldLength) {
|
||||
return SmallFloat.floatToByte315(boost / (float) Math.sqrt(fieldLength));
|
||||
}
|
||||
|
||||
/** The default implementation returns <code>1 / f<sup>2</sup></code>
|
||||
* where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
|
||||
protected float decodeNormValue(byte b) {
|
||||
return NORM_TABLE[b & 0xFF];
|
||||
}
|
||||
|
||||
// Default true
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Determines whether overlap tokens (Tokens with 0 position increment) are
|
||||
* ignored when computing norm. By default this is true, meaning overlap
|
||||
* tokens do not count when computing norms. */
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
}
|
||||
|
||||
/** @see #setDiscountOverlaps */
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
NORM_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final byte computeNorm(FieldInvertState state) {
|
||||
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
|
||||
return encodeNormValue(state.getBoost(), numTerms);
|
||||
}
|
||||
|
||||
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
|
||||
final int df = stats.docFreq();
|
||||
final int max = searcher.maxDoc();
|
||||
final float idf = idf(df, max);
|
||||
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
|
||||
}
|
||||
|
||||
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
|
||||
final int max = searcher.maxDoc();
|
||||
float idf = 0.0f;
|
||||
final Explanation exp = new Explanation();
|
||||
exp.setDescription("idf(), sum of:");
|
||||
for (final TermContext stat : stats ) {
|
||||
final int df = stat.docFreq();
|
||||
final float termIdf = idf(df, max);
|
||||
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
|
||||
idf += termIdf;
|
||||
}
|
||||
exp.setValue(idf);
|
||||
return exp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
|
||||
Explanation idf = termStats.length == 1 ? idfExplain(termStats[0], searcher) : idfExplain(termStats, searcher);
|
||||
|
||||
float avgdl = avgFieldLength(searcher, fieldName);
|
||||
|
||||
// compute freq-independent part of bm25 equation across all norm values
|
||||
float cache[] = new float[256];
|
||||
for (int i = 0; i < cache.length; i++) {
|
||||
cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
|
||||
}
|
||||
return new BM25Stats(idf, queryBoost, avgdl, cache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
|
||||
final byte[] norms = context.reader.norms(fieldName);
|
||||
return norms == null
|
||||
? new ExactBM25DocScorerNoNorms((BM25Stats)stats)
|
||||
: new ExactBM25DocScorer((BM25Stats)stats, norms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
|
||||
return new SloppyBM25DocScorer((BM25Stats) stats, context.reader.norms(fieldName));
|
||||
}
|
||||
|
||||
private class ExactBM25DocScorer extends ExactDocScorer {
|
||||
private final BM25Stats stats;
|
||||
private final float weightValue;
|
||||
private final byte[] norms;
|
||||
private final float[] cache;
|
||||
|
||||
ExactBM25DocScorer(BM25Stats stats, byte norms[]) {
|
||||
assert norms != null;
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1)
|
||||
this.cache = stats.cache;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, int freq) {
|
||||
return weightValue * freq / (freq + cache[norms[doc] & 0xFF]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
return explainScore(doc, freq, stats, norms);
|
||||
}
|
||||
}
|
||||
|
||||
/** there are no norms, we act as if b=0 */
|
||||
private class ExactBM25DocScorerNoNorms extends ExactDocScorer {
|
||||
private final BM25Stats stats;
|
||||
private final float weightValue;
|
||||
private static final int SCORE_CACHE_SIZE = 32;
|
||||
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
|
||||
|
||||
ExactBM25DocScorerNoNorms(BM25Stats stats) {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1)
|
||||
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
|
||||
scoreCache[i] = weightValue * i / (i + k1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, int freq) {
|
||||
// TODO: maybe score cache is more trouble than its worth?
|
||||
return freq < SCORE_CACHE_SIZE // check cache
|
||||
? scoreCache[freq] // cache hit
|
||||
: weightValue * freq / (freq + k1); // cache miss
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
return explainScore(doc, freq, stats, null);
|
||||
}
|
||||
}
|
||||
|
||||
private class SloppyBM25DocScorer extends SloppyDocScorer {
|
||||
private final BM25Stats stats;
|
||||
private final float weightValue; // boost * idf * (k1 + 1)
|
||||
private final byte[] norms;
|
||||
private final float[] cache;
|
||||
|
||||
SloppyBM25DocScorer(BM25Stats stats, byte norms[]) {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1);
|
||||
this.cache = stats.cache;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) {
|
||||
// if there are no norms, we act as if b=0
|
||||
float norm = norms == null ? k1 : cache[norms[doc] & 0xFF];
|
||||
return weightValue * freq / (freq + norm);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
return explainScore(doc, freq, stats, norms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computeSlopFactor(int distance) {
|
||||
return sloppyFreq(distance);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
|
||||
return scorePayload(doc, start, end, payload);
|
||||
}
|
||||
}
|
||||
|
||||
/** Collection statistics for the BM25 model. */
|
||||
private static class BM25Stats extends Stats {
|
||||
/** BM25's idf */
|
||||
private final Explanation idf;
|
||||
/** The average document length. */
|
||||
private final float avgdl;
|
||||
/** query's inner boost */
|
||||
private final float queryBoost;
|
||||
/** weight (idf * boost) */
|
||||
private float weight;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float cache[];
|
||||
|
||||
BM25Stats(Explanation idf, float queryBoost, float avgdl, float cache[]) {
|
||||
this.idf = idf;
|
||||
this.queryBoost = queryBoost;
|
||||
this.avgdl = avgdl;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getValueForNormalization() {
|
||||
// we return a TF-IDF like normalization to be nice, but we don't actually normalize ourselves.
|
||||
final float queryWeight = idf.getValue() * queryBoost;
|
||||
return queryWeight * queryWeight;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float queryNorm, float topLevelBoost) {
|
||||
// we don't normalize with queryNorm at all, we just capture the top-level boost
|
||||
this.weight = idf.getValue() * queryBoost * topLevelBoost;
|
||||
}
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, byte[] norms) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription("score(doc="+doc+",freq="+freq+"), product of:");
|
||||
|
||||
Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
|
||||
if (stats.queryBoost != 1.0f)
|
||||
result.addDetail(boostExpl);
|
||||
|
||||
result.addDetail(stats.idf);
|
||||
|
||||
Explanation tfNormExpl = new Explanation();
|
||||
tfNormExpl.setDescription("tfNorm, computed from:");
|
||||
tfNormExpl.addDetail(freq);
|
||||
tfNormExpl.addDetail(new Explanation(k1, "parameter k1"));
|
||||
if (norms == null) {
|
||||
tfNormExpl.addDetail(new Explanation(0, "parameter b (norms omitted for field)"));
|
||||
tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1));
|
||||
} else {
|
||||
float doclen = decodeNormValue(norms[doc]);
|
||||
tfNormExpl.addDetail(new Explanation(b, "parameter b"));
|
||||
tfNormExpl.addDetail(new Explanation(stats.avgdl, "avgFieldLength"));
|
||||
tfNormExpl.addDetail(new Explanation(doclen, "fieldLength"));
|
||||
tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)));
|
||||
}
|
||||
result.addDetail(tfNormExpl);
|
||||
result.setValue(boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "BM25(k1=" + k1 + ",b=" + b + ")";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* This class acts as the base class for the specific <em>basic model</em>
|
||||
* implementations in the DFR framework. Basic models compute the
|
||||
* <em>informative content Inf<sub>1</sub> = -log<sub>2</sub>Prob<sub>1</sub>
|
||||
* </em>.
|
||||
*
|
||||
* @see DFRSimilarity
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class BasicModel {
|
||||
/** Returns the informative content score. */
|
||||
public abstract float score(BasicStats stats, float tfn);
|
||||
|
||||
/**
|
||||
* Returns an explanation for the score.
|
||||
* <p>Most basic models use the number of documents and the total term
|
||||
* frequency to compute Inf<sub>1</sub>. This method provides a generic
|
||||
* explanation for such models. Subclasses that use other statistics must
|
||||
* override this method.</p>
|
||||
*/
|
||||
public Explanation explain(BasicStats stats, float tfn) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(score(stats, tfn));
|
||||
result.addDetail(new Explanation(tfn, "tfn"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the code of the
|
||||
* basic model formula. Refer to the original paper for the list.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Limiting form of the Bose-Einstein model. The formula used in Lucene differs
|
||||
* slightly from the one in the original paper: {@code F} is increased by {@code tfn}
|
||||
* and {@code N} is increased by {@code F}
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelBE extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
double F = stats.getTotalTermFreq() + tfn;
|
||||
// approximation only holds true when F << N, so we use N += F
|
||||
double N = F + stats.getNumberOfDocuments();
|
||||
return (float)(-log2((N - 1) * Math.E)
|
||||
+ f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
|
||||
}
|
||||
|
||||
/** The <em>f</em> helper function defined for <em>B<sub>E</sub></em>. */
|
||||
private final double f(double n, double m) {
|
||||
return (m + 0.5) * log2(n / m) + (n - m) * log2(n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Be";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Implements the approximation of the binomial model with the divergence
|
||||
* for DFR. The formula used in Lucene differs slightly from the one in the
|
||||
* original paper: to avoid underflow for small values of {@code N} and
|
||||
* {@code F}, {@code N} is increased by {@code 1} and
|
||||
* {@code F} is always increased by {@code tfn}.
|
||||
* <p>
|
||||
* WARNING: for terms that do not meet the expected random distribution
|
||||
* (e.g. stopwords), this model may give poor performance, such as
|
||||
* abnormally high scores for low tf values.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelD extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
// we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
|
||||
// resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
|
||||
// to create a 'normalized' F.
|
||||
double F = stats.getTotalTermFreq() + tfn;
|
||||
double phi = (double)tfn / F;
|
||||
double nphi = 1 - phi;
|
||||
double p = 1.0 / (stats.getNumberOfDocuments() + 1);
|
||||
double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
|
||||
return (float)(D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "D";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Geometric as limiting form of the Bose-Einstein model. The formula used in Lucene differs
|
||||
* slightly from the one in the original paper: {@code F} is increased by {@code tfn}
|
||||
* and {@code N} is increased by {@code F}.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelG extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
|
||||
double lambda = stats.getTotalTermFreq() / (double) (stats.getNumberOfDocuments() + stats.getTotalTermFreq());
|
||||
// -log(1 / (lambda + 1)) -> log(lambda + 1)
|
||||
return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "G";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* An approximation of the <em>I(n<sub>e</sub>)</em> model.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelIF extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
int N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
return tfn * (float)(log2(1 + (N + 1) / (F + 0.5)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "I(F)";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* The basic tf-idf model of randomness.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelIn extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
int N = stats.getNumberOfDocuments();
|
||||
int n = stats.getDocFreq();
|
||||
return tfn * (float)(log2((N + 1) / (n + 0.5)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tfn) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(score(stats, tfn));
|
||||
result.addDetail(new Explanation(tfn, "tfn"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getDocFreq(), "docFreq"));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "I(n)";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Tf-idf model of randomness, based on a mixture of Poisson and inverse
|
||||
* document frequency.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelIne extends BasicModel {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
int N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
|
||||
return tfn * (float)(log2((N + 1) / (ne + 0.5)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "I(ne)";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Implements the Poisson approximation for the binomial model for DFR.
|
||||
* @lucene.experimental
|
||||
* <p>
|
||||
* WARNING: for terms that do not meet the expected random distribution
|
||||
* (e.g. stopwords), this model may give poor performance, such as
|
||||
* abnormally high scores for low tf values.
|
||||
*/
|
||||
public class BasicModelP extends BasicModel {
|
||||
/** {@code log2(Math.E)}, precomputed. */
|
||||
protected static double LOG2_E = log2(Math.E);
|
||||
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn) {
|
||||
float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
|
||||
return (float)(tfn * log2(tfn / lambda)
|
||||
+ (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
|
||||
+ 0.5 * log2(2 * Math.PI * tfn));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "P";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A simple {@link Similarity} provider that returns in
|
||||
* {@code get(String field)} the object passed to its constructor. This class
|
||||
* is aimed at non-VSM models, and therefore both the {@link #coord} and
|
||||
* {@link #queryNorm} methods return {@code 1}. Use
|
||||
* {@link DefaultSimilarityProvider} for {@link DefaultSimilarity}.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicSimilarityProvider implements SimilarityProvider {
|
||||
private final Similarity sim;
|
||||
|
||||
public BasicSimilarityProvider(Similarity sim) {
|
||||
this.sim = sim;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return 1f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return 1f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity get(String field) {
|
||||
return sim;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "BasicSimilarityProvider(" + sim + ")";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,144 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
||||
/**
|
||||
* Stores all statistics commonly used by ranking methods.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicStats extends Similarity.Stats {
|
||||
/** The number of documents. */
|
||||
protected int numberOfDocuments;
|
||||
/** The total number of tokens in the field. */
|
||||
protected long numberOfFieldTokens;
|
||||
/** The average field length. */
|
||||
protected float avgFieldLength;
|
||||
/** The document frequency. */
|
||||
protected int docFreq;
|
||||
/** The total number of occurrences of this term across all documents. */
|
||||
protected long totalTermFreq;
|
||||
|
||||
// -------------------------- Boost-related stuff --------------------------
|
||||
|
||||
/** Query's inner boost. */
|
||||
protected final float queryBoost;
|
||||
/** Any outer query's boost. */
|
||||
protected float topLevelBoost;
|
||||
/** For most Similarities, the immediate and the top level query boosts are
|
||||
* not handled differently. Hence, this field is just the product of the
|
||||
* other two. */
|
||||
protected float totalBoost;
|
||||
|
||||
/** Constructor. Sets the query boost. */
|
||||
public BasicStats(float queryBoost) {
|
||||
this.queryBoost = queryBoost;
|
||||
this.totalBoost = queryBoost;
|
||||
}
|
||||
|
||||
// ------------------------- Getter/setter methods -------------------------
|
||||
|
||||
/** Returns the number of documents. */
|
||||
public int getNumberOfDocuments() {
|
||||
return numberOfDocuments;
|
||||
}
|
||||
|
||||
/** Sets the number of documents. */
|
||||
public void setNumberOfDocuments(int numberOfDocuments) {
|
||||
this.numberOfDocuments = numberOfDocuments;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the total number of tokens in the field.
|
||||
* @see Terms#getSumTotalTermFreq()
|
||||
*/
|
||||
public long getNumberOfFieldTokens() {
|
||||
return numberOfFieldTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the total number of tokens in the field.
|
||||
* @see Terms#getSumTotalTermFreq()
|
||||
*/
|
||||
public void setNumberOfFieldTokens(long numberOfFieldTokens) {
|
||||
this.numberOfFieldTokens = numberOfFieldTokens;
|
||||
}
|
||||
|
||||
/** Returns the average field length. */
|
||||
public float getAvgFieldLength() {
|
||||
return avgFieldLength;
|
||||
}
|
||||
|
||||
/** Sets the average field length. */
|
||||
public void setAvgFieldLength(float avgFieldLength) {
|
||||
this.avgFieldLength = avgFieldLength;
|
||||
}
|
||||
|
||||
/** Returns the document frequency. */
|
||||
public int getDocFreq() {
|
||||
return docFreq;
|
||||
}
|
||||
|
||||
/** Sets the document frequency. */
|
||||
public void setDocFreq(int docFreq) {
|
||||
this.docFreq = docFreq;
|
||||
}
|
||||
|
||||
/** Returns the total number of occurrences of this term across all documents. */
|
||||
public long getTotalTermFreq() {
|
||||
return totalTermFreq;
|
||||
}
|
||||
|
||||
/** Sets the total number of occurrences of this term across all documents. */
|
||||
public void setTotalTermFreq(long totalTermFreq) {
|
||||
this.totalTermFreq = totalTermFreq;
|
||||
}
|
||||
|
||||
// -------------------------- Boost-related stuff --------------------------
|
||||
|
||||
/** The square of the raw normalization value.
|
||||
* @see #rawNormalizationValue() */
|
||||
@Override
|
||||
public float getValueForNormalization() {
|
||||
float rawValue = rawNormalizationValue();
|
||||
return rawValue * rawValue;
|
||||
}
|
||||
|
||||
/** Computes the raw normalization value. This basic implementation returns
|
||||
* the query boost. Subclasses may override this method to include other
|
||||
* factors (such as idf), or to save the value for inclusion in
|
||||
* {@link #normalize(float, float)}, etc.
|
||||
*/
|
||||
protected float rawNormalizationValue() {
|
||||
return queryBoost;
|
||||
}
|
||||
|
||||
/** No normalization is done. {@code topLevelBoost} is saved in the object,
|
||||
* however. */
|
||||
@Override
|
||||
public void normalize(float queryNorm, float topLevelBoost) {
|
||||
this.topLevelBoost = topLevelBoost;
|
||||
totalBoost = queryBoost * topLevelBoost;
|
||||
}
|
||||
|
||||
/** Returns the total boost. */
|
||||
public float getTotalBoost() {
|
||||
return totalBoost;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Implements the <em>divergence from randomness (DFR)</em> framework
|
||||
* introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002.
|
||||
* Probabilistic models of information retrieval based on measuring the
|
||||
* divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002),
|
||||
* 357-389.
|
||||
* <p>The DFR scoring formula is composed of three separate components: the
|
||||
* <em>basic model</em>, the <em>aftereffect</em> and an additional
|
||||
* <em>normalization</em> component, represented by the classes
|
||||
* {@code BasicModel}, {@code AfterEffect} and {@code Normalization},
|
||||
* respectively. The names of these classes were chosen to match the names of
|
||||
* their counterparts in the Terrier IR engine.</p>
|
||||
* <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
|
||||
* is not handled by this implementation.</p>
|
||||
* @see BasicModel
|
||||
* @see AfterEffect
|
||||
* @see Normalization
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DFRSimilarity extends SimilarityBase {
|
||||
/** The basic model for information content. */
|
||||
protected final BasicModel basicModel;
|
||||
/** The first normalization of the information content. */
|
||||
protected final AfterEffect afterEffect;
|
||||
/** The term frequency normalization. */
|
||||
protected final Normalization normalization;
|
||||
|
||||
public DFRSimilarity(BasicModel basicModel,
|
||||
AfterEffect afterEffect,
|
||||
Normalization normalization) {
|
||||
if (basicModel == null || afterEffect == null || normalization == null) {
|
||||
throw new NullPointerException("null parameters not allowed.");
|
||||
}
|
||||
this.basicModel = basicModel;
|
||||
this.afterEffect = afterEffect;
|
||||
this.normalization = normalization;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
float tfn = normalization.tfn(stats, freq, docLen);
|
||||
return stats.getTotalBoost() *
|
||||
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(Explanation expl,
|
||||
BasicStats stats, int doc, float freq, float docLen) {
|
||||
if (stats.getTotalBoost() != 1.0f) {
|
||||
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
|
||||
}
|
||||
|
||||
Explanation normExpl = normalization.explain(stats, freq, docLen);
|
||||
float tfn = normExpl.getValue();
|
||||
expl.addDetail(normExpl);
|
||||
expl.addDetail(basicModel.explain(stats, tfn));
|
||||
expl.addDetail(afterEffect.explain(stats, tfn));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DFR " + basicModel.toString() + afterEffect.toString()
|
||||
+ normalization.toString();
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -85,4 +85,9 @@ public class DefaultSimilarity extends TFIDFSimilarity {
|
|||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DefaultSimilarity";
|
||||
}
|
||||
}
|
|
@ -1,4 +1,5 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* The probabilistic distribution used to model term occurrence
|
||||
* in information-based models.
|
||||
* @see IBSimilarity
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Distribution {
|
||||
/** Computes the score. */
|
||||
public abstract float score(BasicStats stats, float tfn, float lambda);
|
||||
|
||||
/** Explains the score. Returns the name of the model only, since
|
||||
* both {@code tfn} and {@code lambda} are explained elsewhere. */
|
||||
public Explanation explain(BasicStats stats, float tfn, float lambda) {
|
||||
return new Explanation(
|
||||
score(stats, tfn, lambda), getClass().getSimpleName());
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the name of the
|
||||
* distribution.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Log-logistic distribution.
|
||||
* <p>Unlike for DFR, the natural logarithm is used, as
|
||||
* it is faster to compute and the original paper does not express any
|
||||
* preference to a specific base.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DistributionLL extends Distribution {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn, float lambda) {
|
||||
return (float)-Math.log(lambda / (tfn + lambda));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "LL";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* The smoothed power-law (SPL) distribution for the information-based framework
|
||||
* that is described in the original paper.
|
||||
* <p>Unlike for DFR, the natural logarithm is used, as
|
||||
* it is faster to compute and the original paper does not express any
|
||||
* preference to a specific base.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DistributionSPL extends Distribution {
|
||||
@Override
|
||||
public final float score(BasicStats stats, float tfn, float lambda) {
|
||||
if (lambda == 1f) {
|
||||
lambda = 0.99f;
|
||||
}
|
||||
return (float)-Math.log(
|
||||
(Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SPL";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Provides a framework for the family of information-based models, as described
|
||||
* in Stéphane Clinchant and Eric Gaussier. 2010. Information-based
|
||||
* models for ad hoc IR. In Proceeding of the 33rd international ACM SIGIR
|
||||
* conference on Research and development in information retrieval (SIGIR '10).
|
||||
* ACM, New York, NY, USA, 234-241.
|
||||
* <p>The retrieval function is of the form <em>RSV(q, d) = ∑
|
||||
* -x<sup>q</sup><sub>w</sub> log Prob(X<sub>w</sub> ≥
|
||||
* t<sup>d</sup><sub>w</sub> | λ<sub>w</sub>)</em>, where
|
||||
* <ul>
|
||||
* <li><em>x<sup>q</sup><sub>w</sub></em> is the query boost;</li>
|
||||
* <li><em>X<sub>w</sub></em> is a random variable that counts the occurrences
|
||||
* of word <em>w</em>;</li>
|
||||
* <li><em>t<sup>d</sup><sub>w</sub></em> is the normalized term frequency;</li>
|
||||
* <li><em>λ<sub>w</sub></em> is a parameter.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>The framework described in the paper has many similarities to the DFR
|
||||
* framework (see {@link DFRSimilarity}). It is possible that the two
|
||||
* Similarities will be merged at one point.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class IBSimilarity extends SimilarityBase {
|
||||
/** The probabilistic distribution used to model term occurrence. */
|
||||
protected final Distribution distribution;
|
||||
/** The <em>lambda (λ<sub>w</sub>)</em> parameter. */
|
||||
protected final Lambda lambda;
|
||||
/** The term frequency normalization. */
|
||||
protected final Normalization normalization;
|
||||
|
||||
public IBSimilarity(Distribution distribution,
|
||||
Lambda lambda,
|
||||
Normalization normalization) {
|
||||
this.distribution = distribution;
|
||||
this.lambda = lambda;
|
||||
this.normalization = normalization;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
return stats.getTotalBoost() *
|
||||
distribution.score(
|
||||
stats,
|
||||
normalization.tfn(stats, freq, docLen),
|
||||
lambda.lambda(stats));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(
|
||||
Explanation expl, BasicStats stats, int doc, float freq, float docLen) {
|
||||
if (stats.getTotalBoost() != 1.0f) {
|
||||
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
|
||||
}
|
||||
Explanation normExpl = normalization.explain(stats, freq, docLen);
|
||||
Explanation lambdaExpl = lambda.explain(stats);
|
||||
expl.addDetail(normExpl);
|
||||
expl.addDetail(lambdaExpl);
|
||||
expl.addDetail(distribution.explain(
|
||||
stats, normExpl.getValue(), lambdaExpl.getValue()));
|
||||
}
|
||||
|
||||
/**
|
||||
* The name of IB methods follow the pattern
|
||||
* {@code IB <distribution> <lambda><normalization>}. The name of the
|
||||
* distribution is the same as in the original paper; for the names of lambda
|
||||
* parameters, refer to the javadoc of the {@link Lambda} classes.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return "IB " + distribution.toString() + "-" + lambda.toString()
|
||||
+ normalization.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Bayesian smoothing using Dirichlet priors. From Chengxiang Zhai and John
|
||||
* Lafferty. 2001. A study of smoothing methods for language models applied to
|
||||
* Ad Hoc information retrieval. In Proceedings of the 24th annual international
|
||||
* ACM SIGIR conference on Research and development in information retrieval
|
||||
* (SIGIR '01). ACM, New York, NY, USA, 334-342.
|
||||
* <p>
|
||||
* The formula as defined in the paper assigns a negative score to documents that
|
||||
* contain the term, but with fewer occurrences than predicted by the collection
|
||||
* language model. The Lucene implementation returns {@code 0} for such
|
||||
* documents.
|
||||
* </p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LMDirichletSimilarity extends LMSimilarity {
|
||||
/** The μ parameter. */
|
||||
private final float mu;
|
||||
|
||||
/** @param mu the μ parameter. */
|
||||
public LMDirichletSimilarity(CollectionModel collectionModel, float mu) {
|
||||
super(collectionModel);
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
/** @param mu the μ parameter. */
|
||||
public LMDirichletSimilarity(float mu) {
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the default μ value of 2000. */
|
||||
public LMDirichletSimilarity(CollectionModel collectionModel) {
|
||||
this(collectionModel, 2000);
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the default μ value of 2000. */
|
||||
public LMDirichletSimilarity() {
|
||||
this(2000);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
float score = stats.getTotalBoost() * (float)(Math.log(1 + freq /
|
||||
(mu * ((LMStats)stats).getCollectionProbability())) +
|
||||
Math.log(mu / (docLen + mu)));
|
||||
return score > 0.0f ? score : 0.0f;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(Explanation expl, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
if (stats.getTotalBoost() != 1.0f) {
|
||||
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
|
||||
}
|
||||
|
||||
expl.addDetail(new Explanation(mu, "mu"));
|
||||
Explanation weightExpl = new Explanation();
|
||||
weightExpl.setValue((float)Math.log(1 + freq /
|
||||
(mu * ((LMStats)stats).getCollectionProbability())));
|
||||
weightExpl.setDescription("term weight");
|
||||
expl.addDetail(weightExpl);
|
||||
expl.addDetail(new Explanation(
|
||||
(float)Math.log(mu / (docLen + mu)), "document norm"));
|
||||
super.explain(expl, stats, doc, freq, docLen);
|
||||
}
|
||||
|
||||
/** Returns the μ parameter. */
|
||||
public float getMu() {
|
||||
return mu;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return String.format("Dirichlet(%f)", getMu());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Language model based on the Jelinek-Mercer smoothing method. From Chengxiang
|
||||
* Zhai and John Lafferty. 2001. A study of smoothing methods for language
|
||||
* models applied to Ad Hoc information retrieval. In Proceedings of the 24th
|
||||
* annual international ACM SIGIR conference on Research and development in
|
||||
* information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342.
|
||||
* <p>The model has a single parameter, λ. According to said paper, the
|
||||
* optimal value depends on both the collection and the query. The optimal value
|
||||
* is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LMJelinekMercerSimilarity extends LMSimilarity {
|
||||
/** The λ parameter. */
|
||||
private final float lambda;
|
||||
|
||||
/** @param lambda the λ parameter. */
|
||||
public LMJelinekMercerSimilarity(
|
||||
CollectionModel collectionModel, float lambda) {
|
||||
super(collectionModel);
|
||||
this.lambda = lambda;
|
||||
}
|
||||
|
||||
/** @param lambda the λ parameter. */
|
||||
public LMJelinekMercerSimilarity(float lambda) {
|
||||
this.lambda = lambda;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
return stats.getTotalBoost() *
|
||||
(float)Math.log(1 +
|
||||
((1 - lambda) * freq / docLen) /
|
||||
(lambda * ((LMStats)stats).getCollectionProbability()));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(Explanation expl, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
if (stats.getTotalBoost() != 1.0f) {
|
||||
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
|
||||
}
|
||||
expl.addDetail(new Explanation(lambda, "lambda"));
|
||||
super.explain(expl, stats, doc, freq, docLen);
|
||||
}
|
||||
|
||||
/** Returns the λ parameter. */
|
||||
public float getLambda() {
|
||||
return lambda;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return String.format("Jelinek-Mercer(%f)", getLambda());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,155 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
||||
/**
|
||||
* Abstract superclass for language modeling Similarities. The following inner
|
||||
* types are introduced:
|
||||
* <ul>
|
||||
* <li>{@link LMStats}, which defines a new statistic, the probability that
|
||||
* the collection language model generates the current term;</li>
|
||||
* <li>{@link CollectionModel}, which is a strategy interface for objects that
|
||||
* compute the collection language model {@code p(w|C)};</li>
|
||||
* <li>{@link DefaultCollectionModel}, an implementation of the former, that
|
||||
* computes the term probability as the number of occurrences of the term in the
|
||||
* collection, divided by the total number of tokens.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class LMSimilarity extends SimilarityBase {
|
||||
/** The collection model. */
|
||||
protected final CollectionModel collectionModel;
|
||||
|
||||
/** Creates a new instance with the specified collection language model. */
|
||||
public LMSimilarity(CollectionModel collectionModel) {
|
||||
this.collectionModel = collectionModel;
|
||||
}
|
||||
|
||||
/** Creates a new instance with the default collection language model. */
|
||||
public LMSimilarity() {
|
||||
this(new DefaultCollectionModel());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BasicStats newStats(float queryBoost) {
|
||||
return new LMStats(queryBoost);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the collection probability of the current term in addition to the
|
||||
* usual statistics.
|
||||
*/
|
||||
@Override
|
||||
protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, String fieldName, TermContext termContext) throws IOException {
|
||||
super.fillBasicStats(stats, searcher, fieldName, termContext);
|
||||
LMStats lmStats = (LMStats) stats;
|
||||
lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(Explanation expl, BasicStats stats, int doc,
|
||||
float freq, float docLen) {
|
||||
expl.addDetail(new Explanation(collectionModel.computeProbability(stats),
|
||||
"collection probability"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name of the LM method. The values of the parameters should be
|
||||
* included as well.
|
||||
* <p>Used in {@link #toString()}</p>.
|
||||
*/
|
||||
public abstract String getName();
|
||||
|
||||
/**
|
||||
* Returns the name of the LM method. If a custom collection model strategy is
|
||||
* used, its name is included as well.
|
||||
* @see #getName()
|
||||
* @see CollectionModel#getName()
|
||||
* @see DefaultCollectionModel
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
String coll = collectionModel.getName();
|
||||
if (coll != null) {
|
||||
return String.format("LM %s - %s", getName(), coll);
|
||||
} else {
|
||||
return String.format("LM %s", getName());
|
||||
}
|
||||
}
|
||||
|
||||
/** Stores the collection distribution of the current term. */
|
||||
public static class LMStats extends BasicStats {
|
||||
/** The probability that the current term is generated by the collection. */
|
||||
private float collectionProbability;
|
||||
|
||||
public LMStats(float queryBoost) {
|
||||
super(queryBoost);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the probability that the current term is generated by the
|
||||
* collection.
|
||||
*/
|
||||
public final float getCollectionProbability() {
|
||||
return collectionProbability;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the probability that the current term is generated by the
|
||||
* collection.
|
||||
*/
|
||||
public final void setCollectionProbability(float collectionProbability) {
|
||||
this.collectionProbability = collectionProbability;
|
||||
}
|
||||
}
|
||||
|
||||
/** A strategy for computing the collection language model. */
|
||||
public static interface CollectionModel {
|
||||
/**
|
||||
* Computes the probability {@code p(w|C)} according to the language model
|
||||
* strategy for the current term.
|
||||
*/
|
||||
public float computeProbability(BasicStats stats);
|
||||
|
||||
/** The name of the collection model strategy. */
|
||||
public String getName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Models {@code p(w|C)} as the number of occurrences of the term in the
|
||||
* collection, divided by the total number of tokens {@code + 1}.
|
||||
*/
|
||||
public static class DefaultCollectionModel implements CollectionModel {
|
||||
@Override
|
||||
public float computeProbability(BasicStats stats) {
|
||||
return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* The <em>lambda (λ<sub>w</sub>)</em> parameter in information-based
|
||||
* models.
|
||||
* @see IBSimilarity
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Lambda {
|
||||
/** Computes the lambda parameter. */
|
||||
public abstract float lambda(BasicStats stats);
|
||||
/** Explains the lambda parameter. */
|
||||
public abstract Explanation explain(BasicStats stats);
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the code of the lambda
|
||||
* formula. Since the original paper is not very clear on this matter, and
|
||||
* also uses the DFR naming scheme incorrectly, the codes here were chosen
|
||||
* arbitrarily.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Computes lambda as {@code totalTermFreq / numberOfDocuments}.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LambdaDF extends Lambda {
|
||||
@Override
|
||||
public final float lambda(BasicStats stats) {
|
||||
return (float)stats.getDocFreq() / stats.getNumberOfDocuments();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(lambda(stats));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getDocFreq(), "docFreq"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "D";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Computes lambda as {@code docFreq / numberOfDocuments}.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LambdaTTF extends Lambda {
|
||||
@Override
|
||||
public final float lambda(BasicStats stats) {
|
||||
return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(lambda(stats));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "L";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,159 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
||||
/**
|
||||
* Implements the CombSUM method for combining evidence from multiple
|
||||
* similarity values described in: Joseph A. Shaw, Edward A. Fox.
|
||||
* In Text REtrieval Conference (1993), pp. 243-252
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class MultiSimilarity extends Similarity {
|
||||
protected final Similarity sims[];
|
||||
|
||||
public MultiSimilarity(Similarity sims[]) {
|
||||
this.sims = sims;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte computeNorm(FieldInvertState state) {
|
||||
return sims[0].computeNorm(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
|
||||
Stats subStats[] = new Stats[sims.length];
|
||||
for (int i = 0; i < subStats.length; i++) {
|
||||
subStats[i] = sims[i].computeStats(searcher, fieldName, queryBoost, termContexts);
|
||||
}
|
||||
return new MultiStats(subStats);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
|
||||
ExactDocScorer subScorers[] = new ExactDocScorer[sims.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
subScorers[i] = sims[i].exactDocScorer(((MultiStats)stats).subStats[i], fieldName, context);
|
||||
}
|
||||
return new MultiExactDocScorer(subScorers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
|
||||
SloppyDocScorer subScorers[] = new SloppyDocScorer[sims.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
subScorers[i] = sims[i].sloppyDocScorer(((MultiStats)stats).subStats[i], fieldName, context);
|
||||
}
|
||||
return new MultiSloppyDocScorer(subScorers);
|
||||
}
|
||||
|
||||
public static class MultiExactDocScorer extends ExactDocScorer {
|
||||
private final ExactDocScorer subScorers[];
|
||||
|
||||
MultiExactDocScorer(ExactDocScorer subScorers[]) {
|
||||
this.subScorers = subScorers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, int freq) {
|
||||
float sum = 0.0f;
|
||||
for (ExactDocScorer subScorer : subScorers) {
|
||||
sum += subScorer.score(doc, freq);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
Explanation expl = new Explanation(score(doc, (int)freq.getValue()), "sum of:");
|
||||
for (ExactDocScorer subScorer : subScorers) {
|
||||
expl.addDetail(subScorer.explain(doc, freq));
|
||||
}
|
||||
return expl;
|
||||
}
|
||||
}
|
||||
|
||||
public static class MultiSloppyDocScorer extends SloppyDocScorer {
|
||||
private final SloppyDocScorer subScorers[];
|
||||
|
||||
MultiSloppyDocScorer(SloppyDocScorer subScorers[]) {
|
||||
this.subScorers = subScorers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) {
|
||||
float sum = 0.0f;
|
||||
for (SloppyDocScorer subScorer : subScorers) {
|
||||
sum += subScorer.score(doc, freq);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
Explanation expl = new Explanation(score(doc, freq.getValue()), "sum of:");
|
||||
for (SloppyDocScorer subScorer : subScorers) {
|
||||
expl.addDetail(subScorer.explain(doc, freq));
|
||||
}
|
||||
return expl;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computeSlopFactor(int distance) {
|
||||
return subScorers[0].computeSlopFactor(distance);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
|
||||
return subScorers[0].computePayloadFactor(doc, start, end, payload);
|
||||
}
|
||||
}
|
||||
|
||||
public static class MultiStats extends Stats {
|
||||
final Stats subStats[];
|
||||
|
||||
MultiStats(Stats subStats[]) {
|
||||
this.subStats = subStats;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getValueForNormalization() {
|
||||
float sum = 0.0f;
|
||||
for (Stats stat : subStats) {
|
||||
sum += stat.getValueForNormalization();
|
||||
}
|
||||
return sum / subStats.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float queryNorm, float topLevelBoost) {
|
||||
for (Stats stat : subStats) {
|
||||
stat.normalize(queryNorm, topLevelBoost);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* This class acts as the base class for the implementations of the term
|
||||
* frequency normalization methods in the DFR framework.
|
||||
*
|
||||
* @see DFRSimilarity
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Normalization {
|
||||
/** Returns the normalized term frequency.
|
||||
* @param len the field length. */
|
||||
public abstract float tfn(BasicStats stats, float tf, float len);
|
||||
|
||||
/** Returns an explanation for the normalized term frequency.
|
||||
* <p>The default normalization methods use the field length of the document
|
||||
* and the average field length to compute the normalized term frequency.
|
||||
* This method provides a generic explanation for such methods.
|
||||
* Subclasses that use other statistics must override this method.</p>
|
||||
*/
|
||||
public Explanation explain(BasicStats stats, float tf, float len) {
|
||||
Explanation result = new Explanation();
|
||||
result.setDescription(getClass().getSimpleName() + ", computed from: ");
|
||||
result.setValue(tfn(stats, tf, len));
|
||||
result.addDetail(new Explanation(tf, "tf"));
|
||||
result.addDetail(
|
||||
new Explanation(stats.getAvgFieldLength(), "avgFieldLength"));
|
||||
result.addDetail(new Explanation(len, "len"));
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Implementation used when there is no normalization. */
|
||||
public static final class NoNormalization extends Normalization {
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return tf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, float tf, float len) {
|
||||
return new Explanation(1, "no normalization");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the code of the
|
||||
* normalization formula. Refer to the original paper for the list.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Normalization model that assumes a uniform distribution of the term frequency.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationH1 extends Normalization {
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return tf * stats.getAvgFieldLength() / len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "1";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Normalization model in which the term frequency is inversely related to the
|
||||
* length.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationH2 extends Normalization {
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return (float)(tf * log2(1 + stats.getAvgFieldLength() / len));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "2";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Dirichlet Priors normalization
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationH3 extends Normalization {
|
||||
private final float mu;
|
||||
|
||||
public NormalizationH3() {
|
||||
this(800F);
|
||||
}
|
||||
|
||||
public NormalizationH3(float mu) {
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float tfn(BasicStats stats, float tf, float len) {
|
||||
return (tf + mu * (stats.getTotalTermFreq() / (float)stats.getNumberOfFieldTokens())) / (len + mu) * mu;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "3(" + mu + ")";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Pareto-Zipf Normalization
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationZ extends Normalization {
|
||||
final float z;
|
||||
|
||||
public NormalizationZ() {
|
||||
this(0.30F);
|
||||
}
|
||||
|
||||
public NormalizationZ(float z) {
|
||||
this.z = z;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float tfn(BasicStats stats, float tf, float len) {
|
||||
return (float)(tf * Math.pow(stats.avgFieldLength / len, z));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Z(" + z + ")";
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -25,6 +25,12 @@ import org.apache.lucene.index.FieldInvertState;
|
|||
import org.apache.lucene.index.IndexReader; // javadoc
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Terms; // javadoc
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery; // javadoc
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat; // javadoc
|
||||
|
@ -140,7 +146,7 @@ public abstract class Similarity {
|
|||
* <p>
|
||||
* Term frequencies are integers (the term or phrase's tf)
|
||||
*/
|
||||
public abstract class ExactDocScorer {
|
||||
public static abstract class ExactDocScorer {
|
||||
/**
|
||||
* Score a single document
|
||||
* @param doc document id
|
||||
|
@ -169,7 +175,7 @@ public abstract class Similarity {
|
|||
* <p>
|
||||
* Term frequencies are floating point values.
|
||||
*/
|
||||
public abstract class SloppyDocScorer {
|
||||
public static abstract class SloppyDocScorer {
|
||||
/**
|
||||
* Score a single document
|
||||
* @param doc document id
|
|
@ -0,0 +1,345 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
||||
/**
|
||||
* A subclass of {@code Similarity} that provides a simplified API for its
|
||||
* descendants. Subclasses are only required to implement the {@link #score}
|
||||
* and {@link #toString()} methods. Implementing
|
||||
* {@link #explain(Explanation, BasicStats, int, float, float)} is optional,
|
||||
* inasmuch as SimilarityBase already provides a basic explanation of the score
|
||||
* and the term frequency. However, implementers of a subclass are encouraged to
|
||||
* include as much detail about the scoring method as possible.
|
||||
* <p>
|
||||
* Note: multi-word queries such as phrase queries are scored in a different way
|
||||
* than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
|
||||
* the phrase as a whole (since it does not know it), this class instead scores
|
||||
* phrases as a summation of the individual term scores.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class SimilarityBase extends Similarity {
|
||||
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
|
||||
private static final double LOG_2 = Math.log(2);
|
||||
|
||||
/** @see #setDiscountOverlaps */
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Determines whether overlap tokens (Tokens with
|
||||
* 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap
|
||||
* tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
}
|
||||
|
||||
/** @see #setDiscountOverlaps */
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Stats computeStats(IndexSearcher searcher, String fieldName,
|
||||
float queryBoost, TermContext... termContexts) throws IOException {
|
||||
BasicStats stats[] = new BasicStats[termContexts.length];
|
||||
for (int i = 0; i < termContexts.length; i++) {
|
||||
stats[i] = newStats(queryBoost);
|
||||
fillBasicStats(stats[i], searcher, fieldName, termContexts[i]);
|
||||
}
|
||||
return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
|
||||
}
|
||||
|
||||
/** Factory method to return a custom stats object */
|
||||
protected BasicStats newStats(float queryBoost) {
|
||||
return new BasicStats(queryBoost);
|
||||
}
|
||||
|
||||
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
|
||||
* Subclasses can override this method to fill additional stats. */
|
||||
protected void fillBasicStats(BasicStats stats, IndexSearcher searcher,
|
||||
String fieldName, TermContext termContext) throws IOException {
|
||||
IndexReader reader = searcher.getIndexReader();
|
||||
int numberOfDocuments = reader.maxDoc();
|
||||
|
||||
int docFreq = termContext.docFreq();
|
||||
long totalTermFreq = termContext.totalTermFreq();
|
||||
|
||||
// codec does not supply totalTermFreq: substitute docFreq
|
||||
if (totalTermFreq == -1) {
|
||||
totalTermFreq = docFreq;
|
||||
}
|
||||
|
||||
final long numberOfFieldTokens;
|
||||
final float avgFieldLength;
|
||||
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), fieldName);
|
||||
if (terms == null) {
|
||||
// field does not exist;
|
||||
numberOfFieldTokens = 0;
|
||||
avgFieldLength = 1;
|
||||
} else {
|
||||
long sumTotalTermFreq = terms.getSumTotalTermFreq();
|
||||
|
||||
// We have to provide something if codec doesnt supply these measures,
|
||||
// or if someone omitted frequencies for the field... negative values cause
|
||||
// NaN/Inf for some scorers.
|
||||
if (sumTotalTermFreq == -1) {
|
||||
numberOfFieldTokens = docFreq;
|
||||
avgFieldLength = 1;
|
||||
} else {
|
||||
numberOfFieldTokens = sumTotalTermFreq;
|
||||
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: add sumDocFreq for field (numberOfFieldPostings)
|
||||
stats.setNumberOfDocuments(numberOfDocuments);
|
||||
stats.setNumberOfFieldTokens(numberOfFieldTokens);
|
||||
stats.setAvgFieldLength(avgFieldLength);
|
||||
stats.setDocFreq(docFreq);
|
||||
stats.setTotalTermFreq(totalTermFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Scores the document {@code doc}.
|
||||
* <p>Subclasses must apply their scoring formula in this method.</p>
|
||||
* @param stats the corpus level statistics.
|
||||
* @param freq the term frequency.
|
||||
* @param docLen the document length.
|
||||
* @return the score.
|
||||
*/
|
||||
protected abstract float score(BasicStats stats, float freq, float docLen);
|
||||
|
||||
/**
|
||||
* Subclasses should implement this method to explain the score. {@code expl}
|
||||
* already contains the score, the name of the class and the doc id, as well
|
||||
* as the term frequency and its explanation; subclasses can add additional
|
||||
* clauses to explain details of their scoring formulae.
|
||||
* <p>The default implementation does nothing.</p>
|
||||
*
|
||||
* @param expl the explanation to extend with details.
|
||||
* @param stats the corpus level statistics.
|
||||
* @param doc the document id.
|
||||
* @param freq the term frequency.
|
||||
* @param docLen the document length.
|
||||
*/
|
||||
protected void explain(
|
||||
Explanation expl, BasicStats stats, int doc, float freq, float docLen) {}
|
||||
|
||||
/**
|
||||
* Explains the score. The implementation here provides a basic explanation
|
||||
* in the format <em>score(name-of-similarity, doc=doc-id,
|
||||
* freq=term-frequency), computed from:</em>, and
|
||||
* attaches the score (computed via the {@link #score(BasicStats, float, float)}
|
||||
* method) and the explanation for the term frequency. Subclasses content with
|
||||
* this format may add additional details in
|
||||
* {@link #explain(Explanation, BasicStats, int, float, float)}.
|
||||
*
|
||||
* @param stats the corpus level statistics.
|
||||
* @param doc the document id.
|
||||
* @param freq the term frequency and its explanation.
|
||||
* @param docLen the document length.
|
||||
* @return the explanation.
|
||||
*/
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, float docLen) {
|
||||
Explanation result = new Explanation();
|
||||
result.setValue(score(stats, freq.getValue(), docLen));
|
||||
result.setDescription("score(" + getClass().getSimpleName() +
|
||||
", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:");
|
||||
result.addDetail(freq);
|
||||
|
||||
explain(result, stats, doc, freq.getValue(), docLen);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExactDocScorer exactDocScorer(Stats stats, String fieldName,
|
||||
AtomicReaderContext context) throws IOException {
|
||||
byte norms[] = context.reader.norms(fieldName);
|
||||
|
||||
if (stats instanceof MultiSimilarity.MultiStats) {
|
||||
// a multi term query (e.g. phrase). return the summation,
|
||||
// scoring almost as if it were boolean query
|
||||
Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
|
||||
ExactDocScorer subScorers[] = new ExactDocScorer[subStats.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
subScorers[i] = new BasicExactDocScorer((BasicStats)subStats[i], norms);
|
||||
}
|
||||
return new MultiSimilarity.MultiExactDocScorer(subScorers);
|
||||
} else {
|
||||
return new BasicExactDocScorer((BasicStats) stats, norms);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName,
|
||||
AtomicReaderContext context) throws IOException {
|
||||
byte norms[] = context.reader.norms(fieldName);
|
||||
|
||||
if (stats instanceof MultiSimilarity.MultiStats) {
|
||||
// a multi term query (e.g. phrase). return the summation,
|
||||
// scoring almost as if it were boolean query
|
||||
Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
|
||||
SloppyDocScorer subScorers[] = new SloppyDocScorer[subStats.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
subScorers[i] = new BasicSloppyDocScorer((BasicStats)subStats[i], norms);
|
||||
}
|
||||
return new MultiSimilarity.MultiSloppyDocScorer(subScorers);
|
||||
} else {
|
||||
return new BasicSloppyDocScorer((BasicStats) stats, norms);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the name of the Similarity
|
||||
* and preferably the values of parameters (if any) as well.
|
||||
*/
|
||||
@Override
|
||||
public abstract String toString();
|
||||
|
||||
// ------------------------------ Norm handling ------------------------------
|
||||
|
||||
/** Norm -> document length map, indexed by the unsigned norm byte. */
private static final float[] NORM_TABLE = new float[256];

static {
  // Precompute 1 / f^2 for the float f decoded from each possible byte.
  for (int code = 0; code < 256; code++) {
    final float decoded = SmallFloat.byte315ToFloat((byte) code);
    final float squared = decoded * decoded;
    NORM_TABLE[code] = 1.0f / squared;
  }
}
|
||||
|
||||
/**
 * Encodes the document length in the same way as {@link TFIDFSimilarity}.
 * <p>The length is {@code state.getLength() - state.getNumOverlap()} when
 * {@code discountOverlaps} is set, and
 * {@code state.getLength() / state.getBoost()} otherwise. It is then passed,
 * together with {@code state.getBoost()}, to
 * {@link #encodeNormValue(float, float)}.</p>
 */
|
||||
@Override
|
||||
public byte computeNorm(FieldInvertState state) {
|
||||
final float numTerms;
|
||||
if (discountOverlaps)
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
// NOTE(review): this branch divides by the boost and the boost is passed to
// encodeNormValue again below, while the branch above does not divide.
// Looks unintended -- confirm before relying on non-default boosts here.
numTerms = state.getLength() / state.getBoost();
|
||||
return encodeNormValue(state.getBoost(), numTerms);
|
||||
}
|
||||
|
||||
/** Decodes a normalization factor (document length) stored in an index.
|
||||
* @see #encodeNormValue(float,float)
|
||||
*/
|
||||
protected float decodeNormValue(byte norm) {
|
||||
return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
|
||||
}
|
||||
|
||||
/** Encodes the length to a byte via SmallFloat. */
|
||||
protected byte encodeNormValue(float boost, float length) {
|
||||
return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length)));
|
||||
}
|
||||
|
||||
// ----------------------------- Static methods ------------------------------
|
||||
|
||||
/** Returns the base two logarithm of {@code x}. */
|
||||
public static double log2(double x) {
|
||||
// Put this to a 'util' class if we need more of these.
|
||||
return Math.log(x) / LOG_2;
|
||||
}
|
||||
|
||||
// --------------------------------- Classes ---------------------------------
|
||||
|
||||
/** Delegates the {@link #score(int, int)} and
|
||||
* {@link #explain(int, Explanation)} methods to
|
||||
* {@link SimilarityBase#score(BasicStats, float, int)} and
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, int)},
|
||||
* respectively.
|
||||
*/
|
||||
private class BasicExactDocScorer extends ExactDocScorer {
|
||||
private final BasicStats stats;
|
||||
private final byte[] norms;
|
||||
|
||||
BasicExactDocScorer(BasicStats stats, byte norms[]) {
|
||||
this.stats = stats;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, int freq) {
|
||||
// We have to supply something in case norms are omitted
|
||||
return SimilarityBase.this.score(stats, freq,
|
||||
norms == null ? 1F : decodeNormValue(norms[doc]));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
return SimilarityBase.this.explain(stats, doc, freq,
|
||||
norms == null ? 1F : decodeNormValue(norms[doc]));
|
||||
}
|
||||
}
|
||||
|
||||
/** Delegates the {@link #score(int, int)} and
|
||||
* {@link #explain(int, Explanation)} methods to
|
||||
* {@link SimilarityBase#score(BasicStats, float, int)} and
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, int)},
|
||||
* respectively.
|
||||
*/
|
||||
private class BasicSloppyDocScorer extends SloppyDocScorer {
|
||||
private final BasicStats stats;
|
||||
private final byte[] norms;
|
||||
|
||||
BasicSloppyDocScorer(BasicStats stats, byte norms[]) {
|
||||
this.stats = stats;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) {
|
||||
// We have to supply something in case norms are omitted
|
||||
return SimilarityBase.this.score(stats, freq,
|
||||
norms == null ? 1F : decodeNormValue(norms[doc]));
|
||||
}
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) {
|
||||
return SimilarityBase.this.explain(stats, doc, freq,
|
||||
norms == null ? 1F : decodeNormValue(norms[doc]));
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computeSlopFactor(int distance) {
|
||||
return 1.0f / (distance + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
|
||||
return 1f;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,4 +1,6 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -21,6 +21,10 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
@ -303,13 +307,13 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* two term-queries with that same term and hence the computation would still be correct (although
|
||||
* not very efficient).
|
||||
* The default computation for <i>tf(t in d)</i> in
|
||||
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) DefaultSimilarity} is:
|
||||
*
|
||||
* <br> <br>
|
||||
* <table cellpadding="2" cellspacing="2" border="0" align="center">
|
||||
* <tr>
|
||||
* <td valign="middle" align="right" rowspan="1">
|
||||
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} =
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) tf(t in d)} =
|
||||
* </td>
|
||||
* <td valign="top" align="center" rowspan="1">
|
||||
* frequency<sup><big>½</big></sup>
|
||||
|
@ -328,13 +332,13 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
|
||||
* hence it is squared in the equation.
|
||||
* The default computation for <i>idf(t)</i> in
|
||||
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
|
||||
*
|
||||
* <br> <br>
|
||||
* <table cellpadding="2" cellspacing="2" border="0" align="center">
|
||||
* <tr>
|
||||
* <td valign="middle" align="right">
|
||||
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} =
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)} =
|
||||
* </td>
|
||||
* <td valign="middle" align="center">
|
||||
* 1 + log <big>(</big>
|
||||
|
@ -376,14 +380,14 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* This is a search time factor computed by the Similarity in effect at search time.
|
||||
*
|
||||
* The default computation in
|
||||
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
|
||||
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
|
||||
* <br> <br>
|
||||
* <table cellpadding="1" cellspacing="0" border="0" align="center">
|
||||
* <tr>
|
||||
* <td valign="middle" align="right" rowspan="1">
|
||||
* queryNorm(q) =
|
||||
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
|
||||
* {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
|
||||
* =
|
||||
* </td>
|
||||
* <td valign="middle" align="center" rowspan="1">
|
|
@ -0,0 +1,174 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
This package contains the various ranking models that can be used in Lucene. The
|
||||
abstract class {@link org.apache.lucene.search.similarities.Similarity} serves
|
||||
as the base for ranking functions. For searching, users can employ the models
|
||||
already implemented or create their own by extending one of the classes in this
|
||||
package.
|
||||
|
||||
<h2>Table Of Contents</h2>
|
||||
<p>
|
||||
<ol>
|
||||
<li><a href="#sims">Summary of the Ranking Methods</a></li>
|
||||
<li><a href="#providers">Similarity Providers</a></li>
|
||||
<li><a href="#changingSimilarity">Changing the Similarity</a></li>
|
||||
</ol>
|
||||
</p>
|
||||
|
||||
|
||||
<a name="sims"></a>
|
||||
<h2>Summary of the Ranking Methods</h2>
|
||||
|
||||
<p>{@link org.apache.lucene.search.similarities.DefaultSimilarity} is the original Lucene
|
||||
scoring function. It is based on a highly optimized Vector Space Model. For more
|
||||
information, see {@link org.apache.lucene.search.similarities.TFIDFSimilarity}.</p>
|
||||
|
||||
<p>{@link org.apache.lucene.search.similarities.BM25Similarity} is an optimized
|
||||
implementation of the successful Okapi BM25 model.</p>
|
||||
|
||||
<p>{@link org.apache.lucene.search.similarities.SimilarityBase} provides a basic
|
||||
implementation of the Similarity contract and exposes a highly simplified
|
||||
interface, which makes it an ideal starting point for new ranking functions.
|
||||
Lucene ships the following methods built on
|
||||
{@link org.apache.lucene.search.similarities.SimilarityBase}:
|
||||
|
||||
<a name="framework"></a>
|
||||
<ul>
|
||||
<li>Amati and Rijsbergen's {@linkplain org.apache.lucene.search.similarities.DFRSimilarity DFR} framework;</li>
|
||||
<li>Clinchant and Gaussier's {@linkplain org.apache.lucene.search.similarities.IBSimilarity Information-based models}
|
||||
for IR;</li>
|
||||
<li>The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from
|
||||
Zhai and Lafferty's paper.</li>
|
||||
</ul>
|
||||
|
||||
Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not
|
||||
optimized to the same extent as
|
||||
{@link org.apache.lucene.search.similarities.DefaultSimilarity} and
|
||||
{@link org.apache.lucene.search.similarities.BM25Similarity}, a difference in
|
||||
performance is to be expected when using the methods listed above. However,
|
||||
optimizations can always be implemented in subclasses; see
|
||||
<a href="#changingSimilarity">below</a>.</p>
|
||||
|
||||
|
||||
<a name="providers"></a>
|
||||
<h2>Similarity Providers</h2>
|
||||
|
||||
<p>{@link org.apache.lucene.search.similarities.SimilarityProvider}s are factories
|
||||
that return Similarities per-field and compute coordination factors and normalization
|
||||
values for the query.
|
||||
{@link org.apache.lucene.search.similarities.DefaultSimilarityProvider} is the
|
||||
default implementation used by Lucene, geared towards vector-space search: it returns
|
||||
{@link org.apache.lucene.search.similarities.DefaultSimilarity} for every field,
|
||||
and implements coordination-level matching and query normalization.
|
||||
{@link org.apache.lucene.search.similarities.BasicSimilarityProvider} is geared towards
|
||||
non-vector-space models and does not implement coordination-level matching or query
|
||||
normalization. It is a convenience implementation that returns an arbitrary
|
||||
{@link org.apache.lucene.search.similarities.Similarity} for every field.
|
||||
You can write your own SimilarityProvider to return different Similarities for different
|
||||
fields: for example you might want to use different parameter values for different fields,
|
||||
or maybe even entirely different ranking algorithms.
|
||||
</p>
|
||||
|
||||
|
||||
<a name="changingSimilarity"></a>
|
||||
<h2>Changing the Similarity</h2>
|
||||
|
||||
<p>Chances are the available Similarities are sufficient for all
|
||||
your searching needs.
|
||||
However, in some applications it may be necessary to customize your <a
|
||||
href="Similarity.html">Similarity</a> implementation. For instance, some
|
||||
applications do not need to
|
||||
distinguish between shorter and longer documents (see <a
|
||||
href="http://www.gossamer-threads.com/lists/lucene/java-user/38967#38967">a "fair" similarity</a>).</p>
|
||||
|
||||
<p>To change {@link org.apache.lucene.search.similarities.Similarity}, one must do so for both indexing and
|
||||
searching, and the changes must happen before
|
||||
either of these actions takes place. Although in theory there is nothing stopping you from changing mid-stream, it
|
||||
just isn't well-defined what is going to happen.
|
||||
</p>
|
||||
|
||||
<p>To make this change, implement your own {@link org.apache.lucene.search.similarities.Similarity} (likely
|
||||
you'll want to simply subclass an existing method, be it
|
||||
{@link org.apache.lucene.search.similarities.DefaultSimilarity} or a descendant of
|
||||
{@link org.apache.lucene.search.similarities.SimilarityBase}) and
|
||||
{@link org.apache.lucene.search.similarities.SimilarityProvider} (or use
|
||||
{@link org.apache.lucene.search.similarities.BasicSimilarityProvider}), and
|
||||
then register the new class by calling
|
||||
{@link org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)}
|
||||
before indexing and
|
||||
{@link org.apache.lucene.search.IndexSearcher#setSimilarityProvider(SimilarityProvider)}
|
||||
before searching.
|
||||
</p>
|
||||
|
||||
<h3>Extending {@linkplain org.apache.lucene.search.similarities.SimilarityBase}</h3>
|
||||
<p>
|
||||
The easiest way to quickly implement a new ranking method is to extend
|
||||
{@link org.apache.lucene.search.similarities.SimilarityBase}, which provides
|
||||
basic implementations for the low-level details. Subclasses are only required to
|
||||
implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)}
|
||||
and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()}
|
||||
methods.</p>
|
||||
|
||||
<p>Another option is to extend one of the <a href="#framework">frameworks</a>
|
||||
based on {@link org.apache.lucene.search.similarities.SimilarityBase}. These
|
||||
Similarities are implemented modularly, e.g.
|
||||
{@link org.apache.lucene.search.similarities.DFRSimilarity} delegates
|
||||
computation of the three parts of its formula to the classes
|
||||
{@link org.apache.lucene.search.similarities.BasicModel},
|
||||
{@link org.apache.lucene.search.similarities.AfterEffect} and
|
||||
{@link org.apache.lucene.search.similarities.Normalization}. Instead of
|
||||
subclassing the Similarity, one can simply introduce a new basic model and tell
|
||||
{@link org.apache.lucene.search.similarities.DFRSimilarity} to use it.</p>
|
||||
|
||||
<h3>Changing {@linkplain org.apache.lucene.search.similarities.DefaultSimilarity}</h3>
|
||||
<p>
|
||||
If you are interested in use cases for changing your similarity, see the Lucene users' mailing list at <a
|
||||
href="http://www.nabble.com/Overriding-Similarity-tf2128934.html">Overriding Similarity</a>.
|
||||
In summary, here are a few use cases:
|
||||
<ol>
|
||||
<li><p>The <code>SweetSpotSimilarity</code> in
|
||||
<code>org.apache.lucene.misc</code> gives small
|
||||
increases as the frequency increases a small amount
|
||||
and then greater increases when you hit the "sweet spot", i.e. where
|
||||
you think the frequency of terms is more significant.</p></li>
|
||||
<li><p>Overriding tf — In some applications, it doesn't matter what the score of a document is as long as a
|
||||
matching term occurs. In these
|
||||
cases people have overridden Similarity to return 1 from the tf() method.</p></li>
|
||||
<li><p>Changing Length Normalization — By overriding
|
||||
{@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState state)},
|
||||
it is possible to discount how the length of a field contributes
|
||||
to a score. In {@link org.apache.lucene.search.similarities.DefaultSimilarity},
|
||||
lengthNorm = 1 / (numTerms in field)^0.5, but if one changes this to be
|
||||
1 / (numTerms in field), all fields will be treated
|
||||
<a href="http://www.gossamer-threads.com/lists/lucene/java-user/38967#38967">"fairly"</a>.</p></li>
|
||||
</ol>
|
||||
In general, Chris Hostetter sums it up best in saying (from <a
|
||||
href="http://www.gossamer-threads.com/lists/lucene/java-user/39125#39125">the Lucene users' mailing list</a>):
|
||||
<blockquote>[One would override the Similarity in] ... any situation where you know more about your data then just
|
||||
that
|
||||
it's "text" is a situation where it *might* make sense to to override your
|
||||
Similarity method.</blockquote>
|
||||
</p>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -19,11 +19,9 @@ package org.apache.lucene.search.spans;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TFIDFSimilarity;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/**
|
||||
* Public for extension only.
|
||||
|
|
|
@ -21,7 +21,8 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
|||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
|
||||
import java.io.IOException;
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.document.StringField;
|
|||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import static org.apache.lucene.util.LuceneTestCase.TEST_VERSION_CURRENT;
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Random;
|
||||
|
@ -35,7 +36,7 @@ public class CheckHits {
|
|||
* different order of operations from the actual scoring method ...
|
||||
* this allows for a small amount of variation
|
||||
*/
|
||||
public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f;
|
||||
public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.02f;
|
||||
|
||||
/**
|
||||
* Tests that all documents up to maxDoc which are *not* in the
|
||||
|
@ -327,6 +328,10 @@ public class CheckHits {
|
|||
if (!deep) return;
|
||||
|
||||
Explanation detail[] = expl.getDetails();
|
||||
// TODO: can we improve this entire method? its really geared to work only with TF/IDF
|
||||
if (expl.getDescription().endsWith("computed from:")) {
|
||||
return; // something more complicated.
|
||||
}
|
||||
if (detail!=null) {
|
||||
if (detail.length==1) {
|
||||
// simple containment, unless its a freq of: (which lets a query explain how the freq is calculated),
|
||||
|
@ -338,7 +343,7 @@ public class CheckHits {
|
|||
// - end with one of: "product of:", "sum of:", "max of:", or
|
||||
// - have "max plus <x> times others" (where <x> is float).
|
||||
float x = 0;
|
||||
String descr = expl.getDescription().toLowerCase();
|
||||
String descr = expl.getDescription().toLowerCase(Locale.ENGLISH);
|
||||
boolean productOf = descr.endsWith("product of:");
|
||||
boolean sumOf = descr.endsWith("sum of:");
|
||||
boolean maxOf = descr.endsWith("max of:");
|
||||
|
|
|
@ -0,0 +1,158 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.search.similarities.AfterEffect;
|
||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.search.similarities.BasicModel;
|
||||
import org.apache.lucene.search.similarities.BasicModelBE;
|
||||
import org.apache.lucene.search.similarities.BasicModelD;
|
||||
import org.apache.lucene.search.similarities.BasicModelG;
|
||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||
import org.apache.lucene.search.similarities.BasicModelIn;
|
||||
import org.apache.lucene.search.similarities.BasicModelIne;
|
||||
import org.apache.lucene.search.similarities.BasicModelP;
|
||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Distribution;
|
||||
import org.apache.lucene.search.similarities.DistributionLL;
|
||||
import org.apache.lucene.search.similarities.DistributionSPL;
|
||||
import org.apache.lucene.search.similarities.IBSimilarity;
|
||||
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
|
||||
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
|
||||
import org.apache.lucene.search.similarities.Lambda;
|
||||
import org.apache.lucene.search.similarities.LambdaDF;
|
||||
import org.apache.lucene.search.similarities.LambdaTTF;
|
||||
import org.apache.lucene.search.similarities.Normalization;
|
||||
import org.apache.lucene.search.similarities.NormalizationH1;
|
||||
import org.apache.lucene.search.similarities.NormalizationH2;
|
||||
import org.apache.lucene.search.similarities.NormalizationH3;
|
||||
import org.apache.lucene.search.similarities.NormalizationZ;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
public class RandomSimilarityProvider extends DefaultSimilarityProvider {
|
||||
final List<Similarity> knownSims;
|
||||
Map<String,Similarity> previousMappings = new HashMap<String,Similarity>();
|
||||
final int perFieldSeed;
|
||||
final boolean shouldCoord;
|
||||
final boolean shouldQueryNorm;
|
||||
|
||||
public RandomSimilarityProvider(Random random) {
|
||||
perFieldSeed = random.nextInt();
|
||||
shouldCoord = random.nextBoolean();
|
||||
shouldQueryNorm = random.nextBoolean();
|
||||
knownSims = new ArrayList<Similarity>(allSims);
|
||||
Collections.shuffle(knownSims, random);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
if (shouldCoord) {
|
||||
return super.coord(overlap, maxOverlap);
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
if (shouldQueryNorm) {
|
||||
return super.queryNorm(sumOfSquaredWeights);
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized Similarity get(String field) {
|
||||
assert field != null;
|
||||
Similarity sim = previousMappings.get(field);
|
||||
if (sim == null) {
|
||||
sim = knownSims.get(Math.abs(perFieldSeed ^ field.hashCode()) % knownSims.size());
|
||||
previousMappings.put(field, sim);
|
||||
}
|
||||
return sim;
|
||||
}
|
||||
|
||||
// all the similarities that we rotate through
|
||||
/** The DFR basic models to test. */
|
||||
static BasicModel[] BASIC_MODELS = {
|
||||
new BasicModelBE(), /* TODO: enable new BasicModelD(), */ new BasicModelG(),
|
||||
new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
|
||||
/* TODO: enable new BasicModelP() */
|
||||
};
|
||||
/** The DFR aftereffects to test. */
|
||||
static AfterEffect[] AFTER_EFFECTS = {
|
||||
new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
|
||||
};
|
||||
/** The DFR normalizations to test. */
|
||||
static Normalization[] NORMALIZATIONS = {
|
||||
new NormalizationH1(), new NormalizationH2(),
|
||||
new NormalizationH3(), new NormalizationZ()
|
||||
// TODO: if we enable NoNormalization, we have to deal with
|
||||
// a couple tests (e.g. TestDocBoost, TestSort) that expect length normalization
|
||||
// new Normalization.NoNormalization()
|
||||
};
|
||||
/** The distributions for IB. */
|
||||
static Distribution[] DISTRIBUTIONS = {
|
||||
new DistributionLL(), new DistributionSPL()
|
||||
};
|
||||
/** Lambdas for IB. */
|
||||
static Lambda[] LAMBDAS = {
|
||||
new LambdaDF(), new LambdaTTF()
|
||||
};
|
||||
static List<Similarity> allSims;
|
||||
static {
|
||||
allSims = new ArrayList<Similarity>();
|
||||
allSims.add(new DefaultSimilarity());
|
||||
allSims.add(new BM25Similarity());
|
||||
for (BasicModel basicModel : BASIC_MODELS) {
|
||||
for (AfterEffect afterEffect : AFTER_EFFECTS) {
|
||||
for (Normalization normalization : NORMALIZATIONS) {
|
||||
allSims.add(new DFRSimilarity(basicModel, afterEffect, normalization));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (Distribution distribution : DISTRIBUTIONS) {
|
||||
for (Lambda lambda : LAMBDAS) {
|
||||
for (Normalization normalization : NORMALIZATIONS) {
|
||||
allSims.add(new IBSimilarity(distribution, lambda, normalization));
|
||||
}
|
||||
}
|
||||
}
|
||||
/* TODO: enable Dirichlet
|
||||
allSims.add(new LMDirichletSimilarity()); */
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized String toString() {
|
||||
return "RandomSimilarityProvider(queryNorm=" + shouldQueryNorm + ",coord=" + shouldCoord + "): " + previousMappings.toString();
|
||||
}
|
||||
}
|
|
@ -52,6 +52,8 @@ import org.apache.lucene.search.FieldCache;
|
|||
import org.apache.lucene.search.FieldCache.CacheEntry;
|
||||
import org.apache.lucene.search.AssertingIndexSearcher;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.RandomSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.FlushInfo;
|
||||
|
@ -209,6 +211,8 @@ public abstract class LuceneTestCase extends Assert {
|
|||
private static Codec codec;
|
||||
// default codec provider
|
||||
private static CodecProvider savedCodecProvider;
|
||||
|
||||
private static SimilarityProvider similarityProvider;
|
||||
|
||||
private static Locale locale;
|
||||
private static Locale savedLocale;
|
||||
|
@ -393,6 +397,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
savedTimeZone = TimeZone.getDefault();
|
||||
timeZone = TEST_TIMEZONE.equals("random") ? randomTimeZone(random) : TimeZone.getTimeZone(TEST_TIMEZONE);
|
||||
TimeZone.setDefault(timeZone);
|
||||
similarityProvider = new RandomSimilarityProvider(random);
|
||||
testsFailed = false;
|
||||
}
|
||||
|
||||
|
@ -467,6 +472,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
/** print some useful debugging information about the environment */
|
||||
private static void printDebuggingInformation(String codecDescription) {
|
||||
System.err.println("NOTE: test params are: codec=" + codecDescription +
|
||||
", sim=" + similarityProvider +
|
||||
", locale=" + locale +
|
||||
", timezone=" + (timeZone == null ? "(null)" : timeZone.getID()));
|
||||
System.err.println("NOTE: all tests run in this JVM:");
|
||||
|
@ -922,6 +928,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
/** create a new index writer config with random defaults using the specified random */
|
||||
public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) {
|
||||
IndexWriterConfig c = new IndexWriterConfig(v, a);
|
||||
c.setSimilarityProvider(similarityProvider);
|
||||
if (r.nextBoolean()) {
|
||||
c.setMergeScheduler(new SerialMergeScheduler());
|
||||
}
|
||||
|
@ -1249,7 +1256,9 @@ public abstract class LuceneTestCase extends Assert {
|
|||
if (maybeWrap && rarely()) {
|
||||
r = new SlowMultiReaderWrapper(r);
|
||||
}
|
||||
return random.nextBoolean() ? new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext());
|
||||
IndexSearcher ret = random.nextBoolean() ? new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext());
|
||||
ret.setSimilarityProvider(similarityProvider);
|
||||
return ret;
|
||||
} else {
|
||||
int threads = 0;
|
||||
final ExecutorService ex = (random.nextBoolean()) ? null
|
||||
|
@ -1258,7 +1267,7 @@ public abstract class LuceneTestCase extends Assert {
|
|||
if (ex != null && VERBOSE) {
|
||||
System.out.println("NOTE: newSearcher using ExecutorService with " + threads + " threads");
|
||||
}
|
||||
return random.nextBoolean() ?
|
||||
IndexSearcher ret = random.nextBoolean() ?
|
||||
new AssertingIndexSearcher(r, ex) {
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
|
@ -1272,6 +1281,8 @@ public abstract class LuceneTestCase extends Assert {
|
|||
shutdownExecutorService(ex);
|
||||
}
|
||||
};
|
||||
ret.setSimilarityProvider(similarityProvider);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -35,13 +35,13 @@ import org.apache.lucene.document.StringField;
|
|||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.NumericRangeQuery;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.store.CompoundFileDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
|
|
@ -27,11 +27,11 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.document.Document;
|
|||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.store.CompoundFileDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
|
|
@ -40,9 +40,9 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.IndexReader.FieldOption;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
|
|
|
@ -17,7 +17,7 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.TextField;
|
||||
|
|
|
@ -29,10 +29,10 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -23,10 +23,11 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
|
|
@ -35,11 +35,13 @@ import org.apache.lucene.document.FieldType;
|
|||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.FieldCache;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
|
|
|
@ -26,8 +26,8 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
|
|
|
@ -27,9 +27,9 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
|
|
@ -27,10 +27,10 @@ import org.apache.lucene.document.Document;
|
|||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -32,6 +32,9 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -234,7 +235,8 @@ public class TestParallelReader extends LuceneTestCase {
|
|||
w.addDocument(d2);
|
||||
w.close();
|
||||
|
||||
return new IndexSearcher(dir, false);
|
||||
IndexReader ir = IndexReader.open(dir, false);
|
||||
return newSearcher(ir);
|
||||
}
|
||||
|
||||
// Fields 1 & 2 in one index, 3 & 4 in other, with ParallelReader:
|
||||
|
|
|
@ -25,9 +25,9 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
|
|
@ -20,9 +20,11 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.Similarity.Stats;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity.Stats;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TermContext;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
|
|
|
@ -26,6 +26,8 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||
|
|
|
@ -27,6 +27,9 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.NamedThreadFactory;
|
||||
|
@ -72,6 +75,21 @@ public class TestBooleanQuery extends LuceneTestCase {
|
|||
|
||||
IndexReader r = w.getReader();
|
||||
IndexSearcher s = newSearcher(r);
|
||||
// this test relies upon coord being the default implementation,
|
||||
// otherwise scores are different!
|
||||
final SimilarityProvider delegate = s.getSimilarityProvider();
|
||||
s.setSimilarityProvider(new DefaultSimilarityProvider() {
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return delegate.queryNorm(sumOfSquaredWeights);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity get(String field) {
|
||||
return delegate.get(field);
|
||||
}
|
||||
});
|
||||
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);
|
||||
|
||||
|
@ -81,7 +99,7 @@ public class TestBooleanQuery extends LuceneTestCase {
|
|||
subQuery.setBoost(0);
|
||||
q.add(subQuery, BooleanClause.Occur.SHOULD);
|
||||
float score2 = s.search(q, 10).getMaxScore();
|
||||
assertEquals(score*.5, score2, 1e-6);
|
||||
assertEquals(score*.5F, score2, 1e-6);
|
||||
|
||||
// LUCENE-2617: make sure that a clause not in the index still contributes to the score via coord factor
|
||||
BooleanQuery qq = (BooleanQuery)q.clone();
|
||||
|
@ -91,14 +109,14 @@ public class TestBooleanQuery extends LuceneTestCase {
|
|||
phrase.setBoost(0);
|
||||
qq.add(phrase, BooleanClause.Occur.SHOULD);
|
||||
score2 = s.search(qq, 10).getMaxScore();
|
||||
assertEquals(score*(1.0/3), score2, 1e-6);
|
||||
assertEquals(score*(1/3F), score2, 1e-6);
|
||||
|
||||
// now test BooleanScorer2
|
||||
subQuery = new TermQuery(new Term("field", "b"));
|
||||
subQuery.setBoost(0);
|
||||
q.add(subQuery, BooleanClause.Occur.MUST);
|
||||
score2 = s.search(q, 10).getMaxScore();
|
||||
assertEquals(score*(2.0/3), score2, 1e-6);
|
||||
assertEquals(score*(2/3F), score2, 1e-6);
|
||||
|
||||
// PhraseQuery w/ no terms added returns a null scorer
|
||||
PhraseQuery pq = new PhraseQuery();
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search;
|
|||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.spans.*;
|
||||
|
||||
/**
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -29,6 +29,10 @@ import org.apache.lucene.index.FieldInvertState;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Weight.ScorerContext;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import java.text.DecimalFormat;
|
||||
|
|
|
@ -56,7 +56,8 @@ public class TestDocBoost extends LuceneTestCase {
|
|||
|
||||
final float[] scores = new float[4];
|
||||
|
||||
newSearcher(reader).search
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.search
|
||||
(new TermQuery(new Term("field", "word")),
|
||||
new Collector() {
|
||||
private int base = 0;
|
||||
|
@ -82,7 +83,10 @@ public class TestDocBoost extends LuceneTestCase {
|
|||
float lastScore = 0.0f;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
assertTrue(scores[i] > lastScore);
|
||||
if (VERBOSE) {
|
||||
System.out.println(searcher.explain(new TermQuery(new Term("field", "word")), i));
|
||||
}
|
||||
assertTrue("score: " + scores[i] + " should be > lastScore: " + lastScore, scores[i] > lastScore);
|
||||
lastScore = scores[i];
|
||||
}
|
||||
|
||||
|
|
|
@ -30,6 +30,9 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
import org.apache.lucene.index.values.IndexDocValues.Source;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -71,13 +74,24 @@ public class TestDocValuesScoring extends LuceneTestCase {
|
|||
|
||||
// no boosting
|
||||
IndexSearcher searcher1 = newSearcher(ir);
|
||||
final SimilarityProvider base = searcher1.getSimilarityProvider();
|
||||
// boosting
|
||||
IndexSearcher searcher2 = newSearcher(ir);
|
||||
searcher2.setSimilarityProvider(new DefaultSimilarityProvider() {
|
||||
final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost");
|
||||
searcher2.setSimilarityProvider(new SimilarityProvider() {
|
||||
final Similarity fooSim = new BoostingSimilarity(base.get("foo"), "foo_boost");
|
||||
|
||||
public Similarity get(String field) {
|
||||
return "foo".equals(field) ? fooSim : super.get(field);
|
||||
return "foo".equals(field) ? fooSim : base.get(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return base.coord(overlap, maxOverlap);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return base.queryNorm(sumOfSquaredWeights);
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.FieldValueHitQueue.Entry;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -41,7 +42,8 @@ public class TestElevationComparator extends LuceneTestCase {
|
|||
directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).
|
||||
setMaxBufferedDocs(2).
|
||||
setMergePolicy(newLogMergePolicy(1000))
|
||||
setMergePolicy(newLogMergePolicy(1000)).
|
||||
setSimilarityProvider(new DefaultSimilarityProvider())
|
||||
);
|
||||
writer.addDocument(adoc(new String[] {"id", "a", "title", "ipod", "str_s", "a"}));
|
||||
writer.addDocument(adoc(new String[] {"id", "b", "title", "ipod ipod", "str_s", "b"}));
|
||||
|
@ -54,6 +56,7 @@ public class TestElevationComparator extends LuceneTestCase {
|
|||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(r);
|
||||
searcher.setSimilarityProvider(new DefaultSimilarityProvider());
|
||||
|
||||
runTest(searcher, true);
|
||||
runTest(searcher, false);
|
||||
|
|
|
@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
@ -104,6 +107,21 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println("TEST: searcher=" + searcher);
|
||||
}
|
||||
// even though this uses a boost-only rewrite, this test relies upon queryNorm being the default implementation,
|
||||
// otherwise scores are different!
|
||||
final SimilarityProvider delegate = searcher.getSimilarityProvider();
|
||||
searcher.setSimilarityProvider(new DefaultSimilarityProvider() {
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return delegate.coord(overlap, maxOverlap);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity get(String field) {
|
||||
return delegate.get(field);
|
||||
}
|
||||
});
|
||||
|
||||
writer.close();
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
|
|
|
@ -37,6 +37,9 @@ import org.apache.lucene.document.StringField;
|
|||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
|
|
@ -26,6 +26,9 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.SimilarityProvider;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
@ -169,6 +172,19 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
|
|||
|
||||
// test for correct application of query normalization
|
||||
// must use a non score normalizing method for this.
|
||||
|
||||
final SimilarityProvider delegate = search.getSimilarityProvider();
|
||||
search.setSimilarityProvider(new DefaultSimilarityProvider() {
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return delegate.coord(overlap, maxOverlap);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity get(String field) {
|
||||
return delegate.get(field);
|
||||
}
|
||||
});
|
||||
Query q = csrq("data", "1", "6", T, T);
|
||||
q.setBoost(100);
|
||||
search.search(q, null, new Collector() {
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
|
|||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
@ -342,7 +343,10 @@ public class TestPhraseQuery extends LuceneTestCase {
|
|||
|
||||
public void testSlopScoring() throws IOException {
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
|
||||
.setMergePolicy(newLogMergePolicy())
|
||||
.setSimilarityProvider(new DefaultSimilarityProvider()));
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(newField("field", "foo firstname lastname foo", TextField.TYPE_STORED));
|
||||
|
@ -360,6 +364,7 @@ public class TestPhraseQuery extends LuceneTestCase {
|
|||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarityProvider(new DefaultSimilarityProvider());
|
||||
PhraseQuery query = new PhraseQuery();
|
||||
query.add(new Term("field", "firstname"));
|
||||
query.add(new Term("field", "lastname"));
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue