diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2d577d09af9..abd9e84054c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -495,6 +495,34 @@ New features * LUCENE-3423: add Terms.getDocCount(), which returns the number of documents that have at least one term for a field. (Yonik Seeley, Robert Muir) +* LUCENE-2959: Added a variety of different relevance ranking systems to Lucene. + + - Added Okapi BM25, Language Models, Divergence from Randomness, and + Information-Based Models. The models are pluggable, support all of Lucene's + features (boosts, slops, explanations, etc.) and queries (spans, etc.). + + - All models default to the same index-time norm encoding as DefaultSimilarity: + so you can easily try these out/switch back and forth/run experiments and + comparisons without reindexing. Note: most of the models do rely upon index + statistics that are new in Lucene 4.0, so for existing 3.x indexes it's a good + idea to upgrade your index to the new format with IndexUpgrader first. + + - Added a new subclass SimilarityBase which provides a simplified API + for plugging in new ranking algorithms without dealing with all of the + nuances and implementation details of Lucene. + + - Added a new helper class BasicSimilarityProvider that just applies one + scoring algorithm to all fields, with queryNorm() and coord() returning 1. + In general, it is recommended to disable coord() when using the new models. + For example, to use BM25 for all fields: + searcher.setSimilarityProvider(new BasicSimilarityProvider(new BM25Similarity())); + + If you instead want to apply different similarities (e.g. ones with different + parameter values or different algorithms entirely) to different fields, implement + SimilarityProvider with your per-field logic. 
+ + (David Mark Nemeskey via Robert Muir) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index 01512b149a4..d989eaf264f 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -43,7 +43,7 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.AttributeImpl; diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 7d6e050b60c..09275a5c8bc 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -57,8 +57,8 @@ import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.RAMDirectory; // for javadocs import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; diff --git 
a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java index 4322f0ad333..133a29c7710 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java @@ -22,9 +22,9 @@ import java.util.Date; import java.util.List; import java.util.ArrayList; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index f1ac1459532..f3321eff818 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -17,7 +17,7 @@ package org.apache.lucene.misc; -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.index.FieldInvertState; /** diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java index ac28283f061..080e2fecd13 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java @@ -26,13 +26,13 @@ import org.apache.lucene.document.StringField; import 
org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java index 0e9732c4a91..d1457b0b3e4 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java @@ -18,11 +18,11 @@ package org.apache.lucene.misc; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.FieldInvertState; diff --git 
a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java index 6a18043cddc..da4e704e905 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java @@ -31,13 +31,13 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MultiNorms; import org.apache.lucene.index.Term; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index bef95219193..04942968f68 100644 --- a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -31,6 +31,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.TFIDFSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import 
org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; diff --git a/lucene/src/java/org/apache/lucene/document/Field.java b/lucene/src/java/org/apache/lucene/document/Field.java index f82c9adc3c7..c24cba94c67 100644 --- a/lucene/src/java/org/apache/lucene/document/Field.java +++ b/lucene/src/java/org/apache/lucene/document/Field.java @@ -223,14 +223,14 @@ public class Field implements IndexableField { * document. * *

The boost is used to compute the norm factor for the field. By - * default, in the {@link org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, + * default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)} method, * the boost value is multiplied by the length normalization factor and then - * rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the + * rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * - * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState) - * @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float) + * @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState) + * @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float) */ public void setBoost(float boost) { this.boost = boost; diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java index 945503a6310..a63b430e542 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -32,7 +32,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; import org.apache.lucene.index.DocumentsWriterPerThreadPool.ThreadState; import org.apache.lucene.index.FieldInfos.FieldNumberBiMap; import org.apache.lucene.search.Query; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java 
b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 09983bbff2a..4eec2723b05 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -26,7 +26,7 @@ import java.text.NumberFormat; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index a25b9d82b00..e94b3afd24e 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -32,7 +32,7 @@ import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.search.FieldCache; // javadocs -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.*; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; @@ -1012,7 +1012,7 @@ public abstract class IndexReader implements Cloneable,Closeable { * * @see #norms(String) * @see Similarity#computeNorm(FieldInvertState) - * @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte) + * @see org.apache.lucene.search.similarities.DefaultSimilarity#decodeNormValue(byte) * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java 
b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java index 700dbb07145..326a1d8da5e 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java @@ -22,7 +22,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.util.Version; /** diff --git a/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java index 9a0612c7139..825bc55abd7 100644 --- a/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java @@ -17,7 +17,7 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.ArrayUtil; /** Taps into DocInverter, as an InvertedDocEndConsumer, diff --git a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java index 902ae69c3d1..5cc537f69bc 100644 --- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java @@ -24,7 +24,8 @@ import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs; -import org.apache.lucene.search.Similarity.ExactDocScorer; +import org.apache.lucene.search.similarities.SimilarityProvider; +import org.apache.lucene.search.similarities.Similarity.ExactDocScorer; import org.apache.lucene.search.TermQuery.TermWeight; import java.io.IOException; diff --git a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java index e707627b3c7..7f7d53df709 100644 --- a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java +++ b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery.BooleanWeight; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.Scorer.ChildScorer; /* See the description in BooleanScorer.java, comparing diff --git a/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java b/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java index caf21e23551..b0a464ef302 100644 --- a/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java @@ -18,7 +18,7 @@ package 
org.apache.lucene.search; */ import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.search.Similarity.ExactDocScorer; +import org.apache.lucene.search.similarities.Similarity.ExactDocScorer; import org.apache.lucene.util.ArrayUtil; import java.io.IOException; import java.util.Comparator; diff --git a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java index 08cf2c330d4..0a0454b7096 100644 --- a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.index.*; +import org.apache.lucene.search.similarities.Similarity; final class ExactPhraseScorer extends Scorer { private final int endMinus1; diff --git a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java index efe7cf130e2..d7537a81196 100644 --- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java @@ -38,6 +38,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NIOFSDirectory; // javadoc import org.apache.lucene.util.ReaderUtil; diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 1a65b65ee31..47232cc42f9 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -26,7 +26,8 @@ import 
org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TermContext; @@ -164,8 +165,7 @@ public class MultiPhraseQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - if (termArrays.size() == 0) // optimize zero-term case - return null; + assert !termArrays.isEmpty(); final IndexReader reader = context.reader; final Bits liveDocs = reader.getLiveDocs(); @@ -249,7 +249,11 @@ public class MultiPhraseQuery extends Query { @Override public Query rewrite(IndexReader reader) { - if (termArrays.size() == 1) { // optimize one-term case + if (termArrays.isEmpty()) { + BooleanQuery bq = new BooleanQuery(); + bq.setBoost(getBoost()); + return bq; + } else if (termArrays.size() == 1) { // optimize one-term case Term[] terms = termArrays.get(0); BooleanQuery boq = new BooleanQuery(true); for (int i=0; iA document is considered matching if it contains the phrase-query terms * at "valid" positions. 
What "valid positions" are diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 882013979c8..8609637e34d 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -20,6 +20,8 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.LinkedHashSet; +import org.apache.lucene.search.similarities.Similarity; + final class SloppyPhraseScorer extends PhraseScorer { private int slop; private PhrasePositions repeats[]; diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index 048588878c0..32fe5f3b1a5 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -28,7 +28,8 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.search.Similarity.ExactDocScorer; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.ExactDocScorer; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ReaderUtil; diff --git a/lucene/src/java/org/apache/lucene/search/TermScorer.java b/lucene/src/java/org/apache/lucene/search/TermScorer.java index 066ce66821e..20a7fb4b287 100644 --- a/lucene/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/TermScorer.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.similarities.Similarity; /** Expert: A Scorer for documents matching a Term. 
*/ diff --git a/lucene/src/java/org/apache/lucene/search/Weight.java b/lucene/src/java/org/apache/lucene/search/Weight.java index e99c5a6b5cb..94607cb212e 100644 --- a/lucene/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/src/java/org/apache/lucene/search/Weight.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.search.similarities.SimilarityProvider; /** * Expert: Calculate query weights and build query scorers. diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java index c5f8900a4ff..a90e0d955f5 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java @@ -22,10 +22,10 @@ import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.DefaultSimilarity; // javadocs only import org.apache.lucene.search.Weight; -import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.SpanNearQuery; @@ -52,7 +52,7 @@ import java.util.Iterator; *

* Payload scores are aggregated using a pluggable {@link PayloadFunction}. * - * @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef) + * @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef) */ public class PayloadNearQuery extends SpanNearQuery { protected String fieldName; diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java index 0df4220a4a8..bd8e23978bc 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java @@ -20,16 +20,16 @@ package org.apache.lucene.search.payloads; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.search.DefaultSimilarity; // javadocs only import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ComplexExplanation; -import org.apache.lucene.search.Similarity.SloppyDocScorer; import org.apache.lucene.search.Weight.ScorerContext; import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; @@ -49,7 +49,7 @@ import java.io.IOException; * which returns 1 by default. *

* Payload scores are aggregated using a pluggable {@link PayloadFunction}. - * @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef) + * @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef) **/ public class PayloadTermQuery extends SpanTermQuery { protected PayloadFunction function; diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java new file mode 100644 index 00000000000..4610f2bd7b3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java @@ -0,0 +1,63 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * This class acts as the base class for the implementations of the first + * normalization of the informative content in the DFR framework. This + * component is also called the after effect and is defined by the + * formula Inf2 = 1 - Prob2, where + * Prob2 measures the information gain. 
+ * + * @see DFRSimilarity + * @lucene.experimental + */ +public abstract class AfterEffect { + /** Returns the aftereffect score. */ + public abstract float score(BasicStats stats, float tfn); + + /** Returns an explanation for the score. */ + public abstract Explanation explain(BasicStats stats, float tfn); + + /** Implementation used when there is no aftereffect. */ + public static final class NoAfterEffect extends AfterEffect { + @Override + public final float score(BasicStats stats, float tfn) { + return 1f; + } + + @Override + public final Explanation explain(BasicStats stats, float tfn) { + return new Explanation(1, "no aftereffect"); + } + + @Override + public String toString() { + return ""; + } + } + + /** + * Subclasses must override this method to return the code of the + * after effect formula. Refer to the original paper for the list. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java new file mode 100644 index 00000000000..b1f4320043c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java @@ -0,0 +1,49 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Model of the information gain based on the ratio of two Bernoulli processes. + * @lucene.experimental + */ +public class AfterEffectB extends AfterEffect { + @Override + public final float score(BasicStats stats, float tfn) { + long F = stats.getTotalTermFreq(); + int n = stats.getDocFreq(); + return (F + 1) / (n * (tfn + 1)); + } + + @Override + public final Explanation explain(BasicStats stats, float tfn) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(score(stats, tfn)); + result.addDetail(new Explanation(tfn, "tfn")); + result.addDetail(new Explanation(stats.getTotalTermFreq(), "totalTermFreq")); + result.addDetail(new Explanation(stats.getDocFreq(), "docFreq")); + return result; + } + + @Override + public String toString() { + return "B"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java new file mode 100644 index 00000000000..54798309744 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java @@ -0,0 +1,45 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Model of the information gain based on Laplace's law of succession. + * @lucene.experimental + */ +public class AfterEffectL extends AfterEffect { + @Override + public final float score(BasicStats stats, float tfn) { + return 1 / (tfn + 1); + } + + @Override + public final Explanation explain(BasicStats stats, float tfn) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(score(stats, tfn)); + result.addDetail(new Explanation(tfn, "tfn")); + return result; + } + + @Override + public String toString() { + return "L"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java new file mode 100644 index 00000000000..c9c542af10c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -0,0 +1,339 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.util.TermContext; + +/** + * BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker, + * Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3. + * In Proceedings of the Third Text REtrieval Conference (TREC 1994). + * Gaithersburg, USA, November 1994. + * @lucene.experimental + */ +public class BM25Similarity extends Similarity { + private final float k1; + private final float b; + // TODO: should we add a delta like sifaka.cs.uiuc.edu/~ylv2/pub/sigir11-bm25l.pdf ? + + public BM25Similarity(float k1, float b) { + this.k1 = k1; + this.b = b; + } + + /** BM25 with these default values: + *

+ */ + public BM25Similarity() { + this.k1 = 1.2f; + this.b = 0.75f; + } + + /** Implemented as log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5)). */ + protected float idf(int docFreq, int numDocs) { + return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D)); + } + + /** Implemented as 1 / (distance + 1). */ + protected float sloppyFreq(int distance) { + return 1.0f / (distance + 1); + } + + /** The default implementation returns 1 */ + protected float scorePayload(int doc, int start, int end, BytesRef payload) { + return 1; + } + + /** The default implementation computes the average as sumTotalTermFreq / maxDoc, + * or returns 1 if the index does not store sumTotalTermFreq (Lucene 3.x indexes + * or any field that omits frequency information). */ + protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException { + Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field); + if (terms == null) { + // field does not exist; + return 1f; + } + long sumTotalTermFreq = terms.getSumTotalTermFreq(); + long maxdoc = searcher.maxDoc(); + return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc); + } + + /** The default implementation encodes boost / sqrt(length) + * with {@link SmallFloat#floatToByte315(float)}. This is compatible with + * Lucene's default implementation. If you change this, then you should + * change {@link #decodeNormValue(byte)} to match. */ + protected byte encodeNormValue(float boost, int fieldLength) { + return SmallFloat.floatToByte315(boost / (float) Math.sqrt(fieldLength)); + } + + /** The default implementation returns 1 / f2 + * where f is {@link SmallFloat#byte315ToFloat(byte)}. */ + protected float decodeNormValue(byte b) { + return NORM_TABLE[b & 0xFF]; + } + + // Default true + protected boolean discountOverlaps = true; + + /** Determines whether overlap tokens (Tokens with 0 position increment) are + * ignored when computing norm. 
By default this is true, meaning overlap + * tokens do not count when computing norms. */ + public void setDiscountOverlaps(boolean v) { + discountOverlaps = v; + } + + /** @see #setDiscountOverlaps */ + public boolean getDiscountOverlaps() { + return discountOverlaps; + } + + /** Cache of decoded bytes. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + float f = SmallFloat.byte315ToFloat((byte)i); + NORM_TABLE[i] = 1.0f / (f*f); + } + } + + @Override + public final byte computeNorm(FieldInvertState state) { + final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength(); + return encodeNormValue(state.getBoost(), numTerms); + } + + public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException { + final int df = stats.docFreq(); + final int max = searcher.maxDoc(); + final float idf = idf(df, max); + return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); + } + + public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException { + final int max = searcher.maxDoc(); + float idf = 0.0f; + final Explanation exp = new Explanation(); + exp.setDescription("idf(), sum of:"); + for (final TermContext stat : stats ) { + final int df = stat.docFreq(); + final float termIdf = idf(df, max); + exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); + idf += termIdf; + } + exp.setValue(idf); + return exp; + } + + @Override + public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException { + Explanation idf = termStats.length == 1 ? 
idfExplain(termStats[0], searcher) : idfExplain(termStats, searcher); + + float avgdl = avgFieldLength(searcher, fieldName); + + // compute freq-independent part of bm25 equation across all norm values + float cache[] = new float[256]; + for (int i = 0; i < cache.length; i++) { + cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl); + } + return new BM25Stats(idf, queryBoost, avgdl, cache); + } + + @Override + public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + final byte[] norms = context.reader.norms(fieldName); + return norms == null + ? new ExactBM25DocScorerNoNorms((BM25Stats)stats) + : new ExactBM25DocScorer((BM25Stats)stats, norms); + } + + @Override + public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + return new SloppyBM25DocScorer((BM25Stats) stats, context.reader.norms(fieldName)); + } + + private class ExactBM25DocScorer extends ExactDocScorer { + private final BM25Stats stats; + private final float weightValue; + private final byte[] norms; + private final float[] cache; + + ExactBM25DocScorer(BM25Stats stats, byte norms[]) { + assert norms != null; + this.stats = stats; + this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1) + this.cache = stats.cache; + this.norms = norms; + } + + @Override + public float score(int doc, int freq) { + return weightValue * freq / (freq + cache[norms[doc] & 0xFF]); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + } + + /** there are no norms, we act as if b=0 */ + private class ExactBM25DocScorerNoNorms extends ExactDocScorer { + private final BM25Stats stats; + private final float weightValue; + private static final int SCORE_CACHE_SIZE = 32; + private float[] scoreCache = new float[SCORE_CACHE_SIZE]; + + ExactBM25DocScorerNoNorms(BM25Stats stats) { + this.stats = 
stats; + this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1) + for (int i = 0; i < SCORE_CACHE_SIZE; i++) + scoreCache[i] = weightValue * i / (i + k1); + } + + @Override + public float score(int doc, int freq) { + // TODO: maybe score cache is more trouble than its worth? + return freq < SCORE_CACHE_SIZE // check cache + ? scoreCache[freq] // cache hit + : weightValue * freq / (freq + k1); // cache miss + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, null); + } + } + + private class SloppyBM25DocScorer extends SloppyDocScorer { + private final BM25Stats stats; + private final float weightValue; // boost * idf * (k1 + 1) + private final byte[] norms; + private final float[] cache; + + SloppyBM25DocScorer(BM25Stats stats, byte norms[]) { + this.stats = stats; + this.weightValue = stats.weight * (k1 + 1); + this.cache = stats.cache; + this.norms = norms; + } + + @Override + public float score(int doc, float freq) { + // if there are no norms, we act as if b=0 + float norm = norms == null ? k1 : cache[norms[doc] & 0xFF]; + return weightValue * freq / (freq + norm); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + + @Override + public float computeSlopFactor(int distance) { + return sloppyFreq(distance); + } + + @Override + public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { + return scorePayload(doc, start, end, payload); + } + } + + /** Collection statistics for the BM25 model. */ + private static class BM25Stats extends Stats { + /** BM25's idf */ + private final Explanation idf; + /** The average document length. 
*/ + private final float avgdl; + /** query's inner boost */ + private final float queryBoost; + /** weight (idf * boost) */ + private float weight; + /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ + private final float cache[]; + + BM25Stats(Explanation idf, float queryBoost, float avgdl, float cache[]) { + this.idf = idf; + this.queryBoost = queryBoost; + this.avgdl = avgdl; + this.cache = cache; + } + + @Override + public float getValueForNormalization() { + // we return a TF-IDF like normalization to be nice, but we don't actually normalize ourselves. + final float queryWeight = idf.getValue() * queryBoost; + return queryWeight * queryWeight; + } + + @Override + public void normalize(float queryNorm, float topLevelBoost) { + // we don't normalize with queryNorm at all, we just capture the top-level boost + this.weight = idf.getValue() * queryBoost * topLevelBoost; + } + } + + private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, byte[] norms) { + Explanation result = new Explanation(); + result.setDescription("score(doc="+doc+",freq="+freq+"), product of:"); + + Explanation boostExpl = new Explanation(stats.queryBoost, "boost"); + if (stats.queryBoost != 1.0f) + result.addDetail(boostExpl); + + result.addDetail(stats.idf); + + Explanation tfNormExpl = new Explanation(); + tfNormExpl.setDescription("tfNorm, computed from:"); + tfNormExpl.addDetail(freq); + tfNormExpl.addDetail(new Explanation(k1, "parameter k1")); + if (norms == null) { + tfNormExpl.addDetail(new Explanation(0, "parameter b (norms omitted for field)")); + tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1)); + } else { + float doclen = decodeNormValue(norms[doc]); + tfNormExpl.addDetail(new Explanation(b, "parameter b")); + tfNormExpl.addDetail(new Explanation(stats.avgdl, "avgFieldLength")); + tfNormExpl.addDetail(new Explanation(doclen, "fieldLength")); + tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * 
(1 - b + b * doclen/stats.avgdl))); + } + result.addDetail(tfNormExpl); + result.setValue(boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue()); + return result; + } + + @Override + public String toString() { + return "BM25(k1=" + k1 + ",b=" + b + ")"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java new file mode 100644 index 00000000000..ff4d12d2099 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java @@ -0,0 +1,60 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * This class acts as the base class for the specific basic model + * implementations in the DFR framework. Basic models compute the + * informative content Inf1 = -log2Prob1 + * . + * + * @see DFRSimilarity + * @lucene.experimental + */ +public abstract class BasicModel { + /** Returns the informative content score. */ + public abstract float score(BasicStats stats, float tfn); + + /** + * Returns an explanation for the score. + *

<p>Most basic models use the number of documents and the total term frequency to compute Inf<sub>1</sub>. This method provides a generic explanation for such models. Subclasses that use other statistics must override this method.</p>

+ */ + public Explanation explain(BasicStats stats, float tfn) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(score(stats, tfn)); + result.addDetail(new Explanation(tfn, "tfn")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + result.addDetail( + new Explanation(stats.getTotalTermFreq(), "totalTermFreq")); + return result; + } + + /** + * Subclasses must override this method to return the code of the + * basic model formula. Refer to the original paper for the list. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java new file mode 100644 index 00000000000..2ac9165d48d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java @@ -0,0 +1,47 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Limiting form of the Bose-Einstein model. 
The formula used in Lucene differs + * slightly from the one in the original paper: {@code F} is increased by {@code tfn} + * and {@code N} is increased by {@code F} + * @lucene.experimental + */ +public class BasicModelBE extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + double F = stats.getTotalTermFreq() + tfn; + // approximation only holds true when F << N, so we use N += F + double N = F + stats.getNumberOfDocuments(); + return (float)(-log2((N - 1) * Math.E) + + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn)); + } + + /** The f helper function defined for BE. */ + private final double f(double n, double m) { + return (m + 0.5) * log2(n / m) + (n - m) * log2(n); + } + + @Override + public String toString() { + return "Be"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java new file mode 100644 index 00000000000..04b76a6504a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java @@ -0,0 +1,52 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Implements the approximation of the binomial model with the divergence + * for DFR. The formula used in Lucene differs slightly from the one in the + * original paper: to avoid underflow for small values of {@code N} and + * {@code F}, {@code N} is increased by {@code 1} and + * {@code F} is always increased by {@code tfn}. + *

+ * WARNING: for terms that do not meet the expected random distribution + * (e.g. stopwords), this model may give poor performance, such as + * abnormally high scores for low tf values. + * @lucene.experimental + */ +public class BasicModelD extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative, + // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq + // to create a 'normalized' F. + double F = stats.getTotalTermFreq() + tfn; + double phi = (double)tfn / F; + double nphi = 1 - phi; + double p = 1.0 / (stats.getNumberOfDocuments() + 1); + double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p)); + return (float)(D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi)); + } + + @Override + public String toString() { + return "D"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java new file mode 100644 index 00000000000..edd50b0f00f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java @@ -0,0 +1,41 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Geometric as limiting form of the Bose-Einstein model. The formula used in Lucene differs + * slightly from the one in the original paper: {@code F} is increased by {@code tfn} + * and {@code N} is increased by {@code F}. + * @lucene.experimental + */ +public class BasicModelG extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F) + double lambda = stats.getTotalTermFreq() / (double) (stats.getNumberOfDocuments() + stats.getTotalTermFreq()); + // -log(1 / (lambda + 1)) -> log(lambda + 1) + return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda)); + } + + @Override + public String toString() { + return "G"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java new file mode 100644 index 00000000000..3cef323d11c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java @@ -0,0 +1,38 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * An approximation of the I(ne) model. + * @lucene.experimental + */ +public class BasicModelIF extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + int N = stats.getNumberOfDocuments(); + long F = stats.getTotalTermFreq(); + return tfn * (float)(log2(1 + (N + 1) / (F + 0.5))); + } + + @Override + public String toString() { + return "I(F)"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java new file mode 100644 index 00000000000..a61222e5075 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java @@ -0,0 +1,52 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * The basic tf-idf model of randomness. + * @lucene.experimental + */ +public class BasicModelIn extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + int N = stats.getNumberOfDocuments(); + int n = stats.getDocFreq(); + return tfn * (float)(log2((N + 1) / (n + 0.5))); + } + + @Override + public final Explanation explain(BasicStats stats, float tfn) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(score(stats, tfn)); + result.addDetail(new Explanation(tfn, "tfn")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + result.addDetail( + new Explanation(stats.getDocFreq(), "docFreq")); + return result; + } + + @Override + public String toString() { + return "I(n)"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java new file mode 100644 index 00000000000..cdbdeb4edd1 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java @@ -0,0 +1,40 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Tf-idf model of randomness, based on a mixture of Poisson and inverse + * document frequency. + * @lucene.experimental + */ +public class BasicModelIne extends BasicModel { + @Override + public final float score(BasicStats stats, float tfn) { + int N = stats.getNumberOfDocuments(); + long F = stats.getTotalTermFreq(); + double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); + return tfn * (float)(log2((N + 1) / (ne + 0.5))); + } + + @Override + public String toString() { + return "I(ne)"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java new file mode 100644 index 00000000000..41a88232ec2 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java @@ -0,0 +1,46 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Implements the Poisson approximation for the binomial model for DFR. + * @lucene.experimental + *

+ * WARNING: for terms that do not meet the expected random distribution + * (e.g. stopwords), this model may give poor performance, such as + * abnormally high scores for low tf values. + */ +public class BasicModelP extends BasicModel { + /** {@code log2(Math.E)}, precomputed. */ + protected static double LOG2_E = log2(Math.E); + + @Override + public final float score(BasicStats stats, float tfn) { + float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); + return (float)(tfn * log2(tfn / lambda) + + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E + + 0.5 * log2(2 * Math.PI * tfn)); + } + + @Override + public String toString() { + return "P"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java new file mode 100644 index 00000000000..ac2e191d104 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java @@ -0,0 +1,54 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * A simple {@link Similarity} provider that returns in + * {@code get(String field)} the object passed to its constructor. This class + * is aimed at non-VSM models, and therefore both the {@link #coord} and + * {@link #queryNorm} methods return {@code 1}. Use + * {@link DefaultSimilarityProvider} for {@link DefaultSimilarity}. + * @lucene.experimental + */ +public class BasicSimilarityProvider implements SimilarityProvider { + private final Similarity sim; + + public BasicSimilarityProvider(Similarity sim) { + this.sim = sim; + } + + @Override + public float coord(int overlap, int maxOverlap) { + return 1f; + } + + @Override + public float queryNorm(float sumOfSquaredWeights) { + return 1f; + } + + @Override + public Similarity get(String field) { + return sim; + } + + @Override + public String toString() { + return "BasicSimilarityProvider(" + sim + ")"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java new file mode 100644 index 00000000000..a96e7a0aa74 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java @@ -0,0 +1,144 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.Terms; + +/** + * Stores all statistics commonly used ranking methods. + * @lucene.experimental + */ +public class BasicStats extends Similarity.Stats { + /** The number of documents. */ + protected int numberOfDocuments; + /** The total number of tokens in the field. */ + protected long numberOfFieldTokens; + /** The average field length. */ + protected float avgFieldLength; + /** The document frequency. */ + protected int docFreq; + /** The total number of occurrences of this term across all documents. */ + protected long totalTermFreq; + + // -------------------------- Boost-related stuff -------------------------- + + /** Query's inner boost. */ + protected final float queryBoost; + /** Any outer query's boost. */ + protected float topLevelBoost; + /** For most Similarities, the immediate and the top level query boosts are + * not handled differently. Hence, this field is just the product of the + * other two. */ + protected float totalBoost; + + /** Constructor. Sets the query boost. */ + public BasicStats(float queryBoost) { + this.queryBoost = queryBoost; + this.totalBoost = queryBoost; + } + + // ------------------------- Getter/setter methods ------------------------- + + /** Returns the number of documents. */ + public int getNumberOfDocuments() { + return numberOfDocuments; + } + + /** Sets the number of documents. */ + public void setNumberOfDocuments(int numberOfDocuments) { + this.numberOfDocuments = numberOfDocuments; + } + + /** + * Returns the total number of tokens in the field. + * @see Terms#getSumTotalTermFreq() + */ + public long getNumberOfFieldTokens() { + return numberOfFieldTokens; + } + + /** + * Sets the total number of tokens in the field. 
+ * @see Terms#getSumTotalTermFreq() + */ + public void setNumberOfFieldTokens(long numberOfFieldTokens) { + this.numberOfFieldTokens = numberOfFieldTokens; + } + + /** Returns the average field length. */ + public float getAvgFieldLength() { + return avgFieldLength; + } + + /** Sets the average field length. */ + public void setAvgFieldLength(float avgFieldLength) { + this.avgFieldLength = avgFieldLength; + } + + /** Returns the document frequency. */ + public int getDocFreq() { + return docFreq; + } + + /** Sets the document frequency. */ + public void setDocFreq(int docFreq) { + this.docFreq = docFreq; + } + + /** Returns the total number of occurrences of this term across all documents. */ + public long getTotalTermFreq() { + return totalTermFreq; + } + + /** Sets the total number of occurrences of this term across all documents. */ + public void setTotalTermFreq(long totalTermFreq) { + this.totalTermFreq = totalTermFreq; + } + + // -------------------------- Boost-related stuff -------------------------- + + /** The square of the raw normalization value. + * @see #rawNormalizationValue() */ + @Override + public float getValueForNormalization() { + float rawValue = rawNormalizationValue(); + return rawValue * rawValue; + } + + /** Computes the raw normalization value. This basic implementation returns + * the query boost. Subclasses may override this method to include other + * factors (such as idf), or to save the value for inclusion in + * {@link #normalize(float, float)}, etc. + */ + protected float rawNormalizationValue() { + return queryBoost; + } + + /** No normalization is done. {@code topLevelBoost} is saved in the object, + * however. */ + @Override + public void normalize(float queryNorm, float topLevelBoost) { + this.topLevelBoost = topLevelBoost; + totalBoost = queryBoost * topLevelBoost; + } + + /** Returns the total boost. 
*/ + public float getTotalBoost() { + return totalBoost; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java new file mode 100644 index 00000000000..6e3039687cc --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -0,0 +1,86 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Implements the divergence from randomness (DFR) framework + * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002. + * Probabilistic models of information retrieval based on measuring the + * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002), + * 357-389. + *

The DFR scoring formula is composed of three separate components: the + * basic model, the aftereffect and an additional + * normalization component, represented by the classes + * {@code BasicModel}, {@code AfterEffect} and {@code Normalization}, + * respectively. The names of these classes were chosen to match the names of + * their counterparts in the Terrier IR engine.

+ *

Note that qtf, the multiplicity of term-occurrence in the query, + * is not handled by this implementation.

+ * @see BasicModel + * @see AfterEffect + * @see Normalization + * @lucene.experimental + */ +public class DFRSimilarity extends SimilarityBase { + /** The basic model for information content. */ + protected final BasicModel basicModel; + /** The first normalization of the information content. */ + protected final AfterEffect afterEffect; + /** The term frequency normalization. */ + protected final Normalization normalization; + + public DFRSimilarity(BasicModel basicModel, + AfterEffect afterEffect, + Normalization normalization) { + if (basicModel == null || afterEffect == null || normalization == null) { + throw new NullPointerException("null parameters not allowed."); + } + this.basicModel = basicModel; + this.afterEffect = afterEffect; + this.normalization = normalization; + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + float tfn = normalization.tfn(stats, freq, docLen); + return stats.getTotalBoost() * + basicModel.score(stats, tfn) * afterEffect.score(stats, tfn); + } + + @Override + protected void explain(Explanation expl, + BasicStats stats, int doc, float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + + Explanation normExpl = normalization.explain(stats, freq, docLen); + float tfn = normExpl.getValue(); + expl.addDetail(normExpl); + expl.addDetail(basicModel.explain(stats, tfn)); + expl.addDetail(afterEffect.explain(stats, tfn)); + } + + @Override + public String toString() { + return "DFR " + basicModel.toString() + afterEffect.toString() + + normalization.toString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java similarity index 95% rename from lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java rename to lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java index 
bd0a90dcc97..a932b3a4f24 100644 --- a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.BytesRef; @@ -85,4 +85,9 @@ public class DefaultSimilarity extends TFIDFSimilarity { public boolean getDiscountOverlaps() { return discountOverlaps; } + + @Override + public String toString() { + return "DefaultSimilarity"; + } } diff --git a/lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarityProvider.java similarity index 96% rename from lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java rename to lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarityProvider.java index 0b336f7eca1..1ef73ff22f9 100644 --- a/lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarityProvider.java @@ -1,4 +1,5 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; + /** * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/src/java/org/apache/lucene/search/similarities/Distribution.java b/lucene/src/java/org/apache/lucene/search/similarities/Distribution.java new file mode 100644 index 00000000000..c31164e7cb7 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/Distribution.java @@ -0,0 +1,45 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * The probabilistic distribution used to model term occurrence + * in information-based models. + * @see IBSimilarity + * @lucene.experimental + */ +public abstract class Distribution { + /** Computes the score. */ + public abstract float score(BasicStats stats, float tfn, float lambda); + + /** Explains the score. Returns the name of the model only, since + * both {@code tfn} and {@code lambda} are explained elsewhere. */ + public Explanation explain(BasicStats stats, float tfn, float lambda) { + return new Explanation( + score(stats, tfn, lambda), getClass().getSimpleName()); + } + + /** + * Subclasses must override this method to return the name of the + * distribution. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/DistributionLL.java b/lucene/src/java/org/apache/lucene/search/similarities/DistributionLL.java new file mode 100644 index 00000000000..b9e0913b93f --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/DistributionLL.java @@ -0,0 +1,37 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Log-logistic distribution. + *

Unlike for DFR, the natural logarithm is used, as + * it is faster to compute and the original paper does not express any + * preference for a specific base.

+ * @lucene.experimental + */ +public class DistributionLL extends Distribution { + @Override + public final float score(BasicStats stats, float tfn, float lambda) { + return (float)-Math.log(lambda / (tfn + lambda)); + } + + @Override + public String toString() { + return "LL"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/DistributionSPL.java b/lucene/src/java/org/apache/lucene/search/similarities/DistributionSPL.java new file mode 100644 index 00000000000..13baf52e4f3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/DistributionSPL.java @@ -0,0 +1,42 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The smoothed power-law (SPL) distribution for the information-based framework + * that is described in the original paper. + *

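Unlike for DFR, the natural logarithm is used, as + * it is faster to compute and the original paper does not express any + * preference for a specific base. When {@code lambda} is exactly 1 the denominator {@code 1 - lambda} of the formula would be zero, so {@code score} substitutes 0.99 for it.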

+ * @lucene.experimental + */ +public class DistributionSPL extends Distribution { + @Override + public final float score(BasicStats stats, float tfn, float lambda) { + if (lambda == 1f) { + lambda = 0.99f; + } + return (float)-Math.log( + (Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda)); + } + + @Override + public String toString() { + return "SPL"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java new file mode 100644 index 00000000000..eb15e3d6e77 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -0,0 +1,94 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Provides a framework for the family of information-based models, as described + * in Stéphane Clinchant and Eric Gaussier. 2010. Information-based + * models for ad hoc IR. In Proceeding of the 33rd international ACM SIGIR + * conference on Research and development in information retrieval (SIGIR '10). + * ACM, New York, NY, USA, 234-241. + *

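The retrieval function is of the form + * $RSV(q, d) = \sum -x^q_w \log \mathrm{Prob}(X_w \ge t^d_w \mid \lambda_w)$, + * where $x^q_w$ is the query boost, $X_w$ is a random variable that counts the occurrences of word $w$, $t^d_w$ is the normalized term frequency and $\lambda_w$ is a parameter.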

+ *

+ *

The framework described in the paper has many similarities to the DFR + * framework (see {@link DFRSimilarity}). It is possible that the two + * Similarities will be merged at one point.

+ * @lucene.experimental + */ +public class IBSimilarity extends SimilarityBase { + /** The probabilistic distribution used to model term occurrence. */ + protected final Distribution distribution; + /** The lambda (λw) parameter. */ + protected final Lambda lambda; + /** The term frequency normalization. */ + protected final Normalization normalization; + + public IBSimilarity(Distribution distribution, + Lambda lambda, + Normalization normalization) { + this.distribution = distribution; + this.lambda = lambda; + this.normalization = normalization; + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + return stats.getTotalBoost() * + distribution.score( + stats, + normalization.tfn(stats, freq, docLen), + lambda.lambda(stats)); + } + + @Override + protected void explain( + Explanation expl, BasicStats stats, int doc, float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + Explanation normExpl = normalization.explain(stats, freq, docLen); + Explanation lambdaExpl = lambda.explain(stats); + expl.addDetail(normExpl); + expl.addDetail(lambdaExpl); + expl.addDetail(distribution.explain( + stats, normExpl.getValue(), lambdaExpl.getValue())); + } + + /** + * The names of IB methods follow the pattern {@code IB D-LN}, where + * {@code D} is the name of the distribution, {@code L} the code of the lambda formula and {@code N} the name of the normalization. The name of the + * distribution is the same as in the original paper; for the names of lambda + * parameters, refer to the javadoc of the {@link Lambda} classes.
+ */ + @Override + public String toString() { + return "IB " + distribution.toString() + "-" + lambda.toString() + + normalization.toString(); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java new file mode 100644 index 00000000000..ad7e309610a --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -0,0 +1,97 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Bayesian smoothing using Dirichlet priors. From Chengxiang Zhai and John + * Lafferty. 2001. A study of smoothing methods for language models applied to + * Ad Hoc information retrieval. In Proceedings of the 24th annual international + * ACM SIGIR conference on Research and development in information retrieval + * (SIGIR '01). ACM, New York, NY, USA, 334-342. + *

+ * The formula as defined in the paper assigns a negative score to documents that + * contain the term, but with fewer occurrences than predicted by the collection + * language model. The Lucene implementation returns {@code 0} for such + * documents. + *

+ * + * @lucene.experimental + */ +public class LMDirichletSimilarity extends LMSimilarity { + /** The μ parameter. */ + private final float mu; + + /** @param mu the μ parameter. */ + public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { + super(collectionModel); + this.mu = mu; + } + + /** @param mu the μ parameter. */ + public LMDirichletSimilarity(float mu) { + this.mu = mu; + } + + /** Instantiates the similarity with the default μ value of 2000. */ + public LMDirichletSimilarity(CollectionModel collectionModel) { + this(collectionModel, 2000); + } + + /** Instantiates the similarity with the default μ value of 2000. */ + public LMDirichletSimilarity() { + this(2000); + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + float score = stats.getTotalBoost() * (float)(Math.log(1 + freq / + (mu * ((LMStats)stats).getCollectionProbability())) + + Math.log(mu / (docLen + mu))); + return score > 0.0f ? score : 0.0f; + } + + @Override + protected void explain(Explanation expl, BasicStats stats, int doc, + float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + + expl.addDetail(new Explanation(mu, "mu")); + Explanation weightExpl = new Explanation(); + weightExpl.setValue((float)Math.log(1 + freq / + (mu * ((LMStats)stats).getCollectionProbability()))); + weightExpl.setDescription("term weight"); + expl.addDetail(weightExpl); + expl.addDetail(new Explanation( + (float)Math.log(mu / (docLen + mu)), "document norm")); + super.explain(expl, stats, doc, freq, docLen); + } + + /** Returns the μ parameter. 
*/ + public float getMu() { + return mu; + } + + @Override + public String getName() { + return String.format(java.util.Locale.ROOT, "Dirichlet(%f)", getMu()); /* Locale.ROOT keeps the decimal separator independent of the JVM's default locale, so the name is stable everywhere */ + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java new file mode 100644 index 00000000000..910e769d64c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -0,0 +1,77 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Language model based on the Jelinek-Mercer smoothing method. From Chengxiang + * Zhai and John Lafferty. 2001. A study of smoothing methods for language + * models applied to Ad Hoc information retrieval. In Proceedings of the 24th + * annual international ACM SIGIR conference on Research and development in + * information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342. + *

The model has a single parameter, λ. According to said paper, the + * optimal value depends on both the collection and the query. The optimal value + * is around {@code 0.1} for title queries and {@code 0.7} for long queries.

+ * + * @lucene.experimental + */ +public class LMJelinekMercerSimilarity extends LMSimilarity { + /** The λ parameter. */ + private final float lambda; + + /** @param lambda the λ parameter. */ + public LMJelinekMercerSimilarity( + CollectionModel collectionModel, float lambda) { + super(collectionModel); + this.lambda = lambda; + } + + /** @param lambda the λ parameter. */ + public LMJelinekMercerSimilarity(float lambda) { + this.lambda = lambda; + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + return stats.getTotalBoost() * + (float)Math.log(1 + + ((1 - lambda) * freq / docLen) / + (lambda * ((LMStats)stats).getCollectionProbability())); + } + + @Override + protected void explain(Explanation expl, BasicStats stats, int doc, + float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + expl.addDetail(new Explanation(lambda, "lambda")); + super.explain(expl, stats, doc, freq, docLen); + } + + /** Returns the λ parameter. */ + public float getLambda() { + return lambda; + } + + @Override + public String getName() { + return String.format(java.util.Locale.ROOT, "Jelinek-Mercer(%f)", getLambda()); /* Locale.ROOT keeps the decimal separator independent of the JVM's default locale, so the name is stable everywhere */ + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java new file mode 100644 index 00000000000..bf1df961169 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -0,0 +1,155 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.TermContext; + +/** + * Abstract superclass for language modeling Similarities. The following inner + * types are introduced: + *
    + *
  • {@link LMStats}, which defines a new statistic, the probability that + * the collection language model generates the current term;
  • + *
  • {@link CollectionModel}, which is a strategy interface for objects that + * compute the collection language model {@code p(w|C)};
  • + *
  • {@link DefaultCollectionModel}, an implementation of the former, that + * computes the term probability as the number of occurrences of the term in the + * collection, divided by the total number of tokens {@code + 1}.
  • + *
+ * + * @lucene.experimental + */ +public abstract class LMSimilarity extends SimilarityBase { + /** The collection model. */ + protected final CollectionModel collectionModel; + + /** Creates a new instance with the specified collection language model. */ + public LMSimilarity(CollectionModel collectionModel) { + this.collectionModel = collectionModel; + } + + /** Creates a new instance with the default collection language model. */ + public LMSimilarity() { + this(new DefaultCollectionModel()); + } + + @Override + protected BasicStats newStats(float queryBoost) { + return new LMStats(queryBoost); + } + + /** + * Computes the collection probability of the current term in addition to the + * usual statistics. + */ + @Override + protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, String fieldName, TermContext termContext) throws IOException { + super.fillBasicStats(stats, searcher, fieldName, termContext); + LMStats lmStats = (LMStats) stats; + lmStats.setCollectionProbability(collectionModel.computeProbability(stats)); + } + + @Override + protected void explain(Explanation expl, BasicStats stats, int doc, + float freq, float docLen) { + expl.addDetail(new Explanation(collectionModel.computeProbability(stats), + "collection probability")); + } + + /** + * Returns the name of the LM method. The values of the parameters should be + * included as well. + *

Used in {@link #toString()}

. + */ + public abstract String getName(); + + /** + * Returns the name of the LM method. If a custom collection model strategy is + * used, its name is included as well. + * @see #getName() + * @see CollectionModel#getName() + * @see DefaultCollectionModel + */ + @Override + public String toString() { + String coll = collectionModel.getName(); + if (coll != null) { + return String.format("LM %s - %s", getName(), coll); + } else { + return String.format("LM %s", getName()); + } + } + + /** Stores the collection distribution of the current term. */ + public static class LMStats extends BasicStats { + /** The probability that the current term is generated by the collection. */ + private float collectionProbability; + + public LMStats(float queryBoost) { + super(queryBoost); + } + + /** + * Returns the probability that the current term is generated by the + * collection. + */ + public final float getCollectionProbability() { + return collectionProbability; + } + + /** + * Sets the probability that the current term is generated by the + * collection. + */ + public final void setCollectionProbability(float collectionProbability) { + this.collectionProbability = collectionProbability; + } + } + + /** A strategy for computing the collection language model. */ + public static interface CollectionModel { + /** + * Computes the probability {@code p(w|C)} according to the language model + * strategy for the current term. + */ + public float computeProbability(BasicStats stats); + + /** The name of the collection model strategy. */ + public String getName(); + } + + /** + * Models {@code p(w|C)} as the number of occurrences of the term in the + * collection, divided by the total number of tokens {@code + 1}. 
+ */ + public static class DefaultCollectionModel implements CollectionModel { + @Override + public float computeProbability(BasicStats stats) { + return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1); + } + + @Override + public String getName() { + return null; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java b/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java new file mode 100644 index 00000000000..64b8c34fd85 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java @@ -0,0 +1,42 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * The lambda (λw) parameter in information-based + * models. + * @see IBSimilarity + * @lucene.experimental + */ +public abstract class Lambda { + /** Computes the lambda parameter. */ + public abstract float lambda(BasicStats stats); + /** Explains the lambda parameter. */ + public abstract Explanation explain(BasicStats stats); + + /** + * Subclasses must override this method to return the code of the lambda + * formula. 
Since the original paper is not very clear on this matter, and + * also uses the DFR naming scheme incorrectly, the codes here were chosen + * arbitrarily. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java new file mode 100644 index 00000000000..7e4a8240b87 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java @@ -0,0 +1,48 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Computes lambda as {@code docFreq / numberOfDocuments}.
+ * @lucene.experimental + */ +public class LambdaDF extends Lambda { + @Override + public final float lambda(BasicStats stats) { + return (float)stats.getDocFreq() / stats.getNumberOfDocuments(); + } + + @Override + public final Explanation explain(BasicStats stats) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(lambda(stats)); + result.addDetail( + new Explanation(stats.getDocFreq(), "docFreq")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + return result; + } + + @Override + public String toString() { + return "D"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java new file mode 100644 index 00000000000..25c55bd72ce --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java @@ -0,0 +1,48 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
+ * @lucene.experimental + */ +public class LambdaTTF extends Lambda { + @Override + public final float lambda(BasicStats stats) { + return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); + } + + @Override + public final Explanation explain(BasicStats stats) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(lambda(stats)); + result.addDetail( + new Explanation(stats.getTotalTermFreq(), "totalTermFreq")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + return result; + } + + @Override + public String toString() { + return "L"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java new file mode 100644 index 00000000000..46546ada55e --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java @@ -0,0 +1,159 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; + +/** + * Implements the CombSUM method for combining evidence from multiple + * similarity values described in: Joseph A. Shaw, Edward A. Fox. + * In Text REtrieval Conference (1993), pp. 243-252 + * @lucene.experimental + */ +public class MultiSimilarity extends Similarity { + protected final Similarity sims[]; + + public MultiSimilarity(Similarity sims[]) { + this.sims = sims; + } + + @Override + public byte computeNorm(FieldInvertState state) { + return sims[0].computeNorm(state); + } + + @Override + public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException { + Stats subStats[] = new Stats[sims.length]; + for (int i = 0; i < subStats.length; i++) { + subStats[i] = sims[i].computeStats(searcher, fieldName, queryBoost, termContexts); + } + return new MultiStats(subStats); + } + + @Override + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + ExactDocScorer subScorers[] = new ExactDocScorer[sims.length]; + for (int i = 0; i < subScorers.length; i++) { + subScorers[i] = sims[i].exactDocScorer(((MultiStats)stats).subStats[i], fieldName, context); + } + return new MultiExactDocScorer(subScorers); + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + SloppyDocScorer subScorers[] = new SloppyDocScorer[sims.length]; + for (int i = 0; i < subScorers.length; i++) { + subScorers[i] = sims[i].sloppyDocScorer(((MultiStats)stats).subStats[i], fieldName, context); + } + return new MultiSloppyDocScorer(subScorers); + } + + 
public static class MultiExactDocScorer extends ExactDocScorer { + private final ExactDocScorer subScorers[]; + + MultiExactDocScorer(ExactDocScorer subScorers[]) { + this.subScorers = subScorers; + } + + @Override + public float score(int doc, int freq) { + float sum = 0.0f; + for (ExactDocScorer subScorer : subScorers) { + sum += subScorer.score(doc, freq); + } + return sum; + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation expl = new Explanation(score(doc, (int)freq.getValue()), "sum of:"); + for (ExactDocScorer subScorer : subScorers) { + expl.addDetail(subScorer.explain(doc, freq)); + } + return expl; + } + } + + public static class MultiSloppyDocScorer extends SloppyDocScorer { + private final SloppyDocScorer subScorers[]; + + MultiSloppyDocScorer(SloppyDocScorer subScorers[]) { + this.subScorers = subScorers; + } + + @Override + public float score(int doc, float freq) { + float sum = 0.0f; + for (SloppyDocScorer subScorer : subScorers) { + sum += subScorer.score(doc, freq); + } + return sum; + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation expl = new Explanation(score(doc, freq.getValue()), "sum of:"); + for (SloppyDocScorer subScorer : subScorers) { + expl.addDetail(subScorer.explain(doc, freq)); + } + return expl; + } + + @Override + public float computeSlopFactor(int distance) { + return subScorers[0].computeSlopFactor(distance); + } + + @Override + public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { + return subScorers[0].computePayloadFactor(doc, start, end, payload); + } + } + + public static class MultiStats extends Stats { + final Stats subStats[]; + + MultiStats(Stats subStats[]) { + this.subStats = subStats; + } + + @Override + public float getValueForNormalization() { + float sum = 0.0f; + for (Stats stat : subStats) { + sum += stat.getValueForNormalization(); + } + return sum / subStats.length; + } + + @Override + public void 
normalize(float queryNorm, float topLevelBoost) { + for (Stats stat : subStats) { + stat.normalize(queryNorm, topLevelBoost); + } + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java b/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java new file mode 100644 index 00000000000..f635baa1c48 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java @@ -0,0 +1,75 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * This class acts as the base class for the implementations of the term + * frequency normalization methods in the DFR framework. + * + * @see DFRSimilarity + * @lucene.experimental + */ +public abstract class Normalization { + /** Returns the normalized term frequency. + * @param len the field length. */ + public abstract float tfn(BasicStats stats, float tf, float len); + + /** Returns an explanation for the normalized term frequency. + *

The default normalization methods use the field length of the document + * and the average field length to compute the normalized term frequency. + * This method provides a generic explanation for such methods. + * Subclasses that use other statistics must override this method.

+ */ + public Explanation explain(BasicStats stats, float tf, float len) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(tfn(stats, tf, len)); + result.addDetail(new Explanation(tf, "tf")); + result.addDetail( + new Explanation(stats.getAvgFieldLength(), "avgFieldLength")); + result.addDetail(new Explanation(len, "len")); + return result; + } + + /** Implementation used when there is no normalization. */ + public static final class NoNormalization extends Normalization { + @Override + public final float tfn(BasicStats stats, float tf, float len) { + return tf; + } + + @Override + public final Explanation explain(BasicStats stats, float tf, float len) { + return new Explanation(1, "no normalization"); + } + + @Override + public String toString() { + return ""; + } + } + + /** + * Subclasses must override this method to return the code of the + * normalization formula. Refer to the original paper for the list. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java new file mode 100644 index 00000000000..77b18055903 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java @@ -0,0 +1,34 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Normalization model that assumes a uniform distribution of the term frequency. + * @lucene.experimental + */ +public class NormalizationH1 extends Normalization { + @Override + public final float tfn(BasicStats stats, float tf, float len) { + return tf * stats.getAvgFieldLength() / len; + } + + @Override + public String toString() { + return "1"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java new file mode 100644 index 00000000000..9055e6f7f73 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java @@ -0,0 +1,37 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.search.similarities.SimilarityBase.log2; + +/** + * Normalization model in which the term frequency is inversely related to the + * length. + * @lucene.experimental + */ +public class NormalizationH2 extends Normalization { + @Override + public final float tfn(BasicStats stats, float tf, float len) { + return (float)(tf * log2(1 + stats.getAvgFieldLength() / len)); + } + + @Override + public String toString() { + return "2"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java new file mode 100644 index 00000000000..97bf86a1135 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java @@ -0,0 +1,44 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Dirichlet Priors normalization + * @lucene.experimental + */ +public class NormalizationH3 extends Normalization { + private final float mu; + + public NormalizationH3() { + this(800F); + } + + public NormalizationH3(float mu) { + this.mu = mu; + } + + @Override + public float tfn(BasicStats stats, float tf, float len) { + return (tf + mu * (stats.getTotalTermFreq() / (float)stats.getNumberOfFieldTokens())) / (len + mu) * mu; + } + + @Override + public String toString() { + return "3(" + mu + ")"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java new file mode 100644 index 00000000000..9f15288f222 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java @@ -0,0 +1,44 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Pareto-Zipf Normalization + * @lucene.experimental + */ +public class NormalizationZ extends Normalization { + final float z; + + public NormalizationZ() { + this(0.30F); + } + + public NormalizationZ(float z) { + this.z = z; + } + + @Override + public float tfn(BasicStats stats, float tf, float len) { + return (float)(tf * Math.pow(stats.avgFieldLength / len, z)); + } + + @Override + public String toString() { + return "Z(" + z + ")"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java similarity index 96% rename from lucene/src/java/org/apache/lucene/search/Similarity.java rename to lucene/src/java/org/apache/lucene/search/similarities/Similarity.java index fe184be66c8..c0262eb21ca 100644 --- a/lucene/src/java/org/apache/lucene/search/Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -25,6 +25,12 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; // javadoc import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.Terms; // javadoc +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.spans.SpanQuery; // javadoc import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.SmallFloat; // javadoc @@ -140,7 +146,7 @@ public abstract class Similarity { *

* Term frequencies are integers (the term or phrase's tf) */ - public abstract class ExactDocScorer { + public static abstract class ExactDocScorer { /** * Score a single document * @param doc document id @@ -169,7 +175,7 @@ public abstract class Similarity { *

* Term frequencies are floating point values. */ - public abstract class SloppyDocScorer { + public static abstract class SloppyDocScorer { /** * Score a single document * @param doc document id diff --git a/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java new file mode 100644 index 00000000000..4eb2c8c1315 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -0,0 +1,345 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.util.TermContext; + +/** + * A subclass of {@code Similarity} that provides a simplified API for its + * descendants. 
Subclasses are only required to implement the {@link #score} + * and {@link #toString()} methods. Implementing + * {@link #explain(Explanation, BasicStats, int, float, float)} is optional, + * inasmuch as SimilarityBase already provides a basic explanation of the score + * and the term frequency. However, implementers of a subclass are encouraged to + * include as much detail about the scoring method as possible. + *

+ * Note: multi-word queries such as phrase queries are scored in a different way + * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for + * the phrase as a whole (since it does not know it), this class instead scores + * phrases as a summation of the individual term scores. + * @lucene.experimental + */ +public abstract class SimilarityBase extends Similarity { + /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ + private static final double LOG_2 = Math.log(2); + + /** @see #setDiscountOverlaps */ + protected boolean discountOverlaps = true; + + /** Determines whether overlap tokens (Tokens with + * 0 position increment) are ignored when computing + * norm. By default this is true, meaning overlap + * tokens do not count when computing norms. + * + * @lucene.experimental + * + * @see #computeNorm + */ + public void setDiscountOverlaps(boolean v) { + discountOverlaps = v; + } + + /** @see #setDiscountOverlaps */ + public boolean getDiscountOverlaps() { + return discountOverlaps; + } + + @Override + public final Stats computeStats(IndexSearcher searcher, String fieldName, + float queryBoost, TermContext... termContexts) throws IOException { + BasicStats stats[] = new BasicStats[termContexts.length]; + for (int i = 0; i < termContexts.length; i++) { + stats[i] = newStats(queryBoost); + fillBasicStats(stats[i], searcher, fieldName, termContexts[i]); + } + return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats); + } + + /** Factory method to return a custom stats object */ + protected BasicStats newStats(float queryBoost) { + return new BasicStats(queryBoost); + } + + /** Fills all member fields defined in {@code BasicStats} in {@code stats}. + * Subclasses can override this method to fill additional stats. 
*/ + protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, + String fieldName, TermContext termContext) throws IOException { + IndexReader reader = searcher.getIndexReader(); + int numberOfDocuments = reader.maxDoc(); + + int docFreq = termContext.docFreq(); + long totalTermFreq = termContext.totalTermFreq(); + + // codec does not supply totalTermFreq: substitute docFreq + if (totalTermFreq == -1) { + totalTermFreq = docFreq; + } + + final long numberOfFieldTokens; + final float avgFieldLength; + + Terms terms = MultiFields.getTerms(searcher.getIndexReader(), fieldName); + if (terms == null) { + // field does not exist; + numberOfFieldTokens = 0; + avgFieldLength = 1; + } else { + long sumTotalTermFreq = terms.getSumTotalTermFreq(); + + // We have to provide something if codec doesnt supply these measures, + // or if someone omitted frequencies for the field... negative values cause + // NaN/Inf for some scorers. + if (sumTotalTermFreq == -1) { + numberOfFieldTokens = docFreq; + avgFieldLength = 1; + } else { + numberOfFieldTokens = sumTotalTermFreq; + avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; + } + } + + // TODO: add sumDocFreq for field (numberOfFieldPostings) + stats.setNumberOfDocuments(numberOfDocuments); + stats.setNumberOfFieldTokens(numberOfFieldTokens); + stats.setAvgFieldLength(avgFieldLength); + stats.setDocFreq(docFreq); + stats.setTotalTermFreq(totalTermFreq); + } + + /** + * Scores the document {@code doc}. + *

Subclasses must apply their scoring formula in this method.

+ * @param stats the corpus level statistics. + * @param freq the term frequency. + * @param docLen the document length. + * @return the score. + */ + protected abstract float score(BasicStats stats, float freq, float docLen); + + /** + * Subclasses should implement this method to explain the score. {@code expl} + * already contains the score, the name of the class and the doc id, as well + * as the term frequency and its explanation; subclasses can add additional + * clauses to explain details of their scoring formulae. + *

The default implementation does nothing.

+ * + * @param expl the explanation to extend with details. + * @param stats the corpus level statistics. + * @param doc the document id. + * @param freq the term frequency. + * @param docLen the document length. + */ + protected void explain( + Explanation expl, BasicStats stats, int doc, float freq, float docLen) {} + + /** + * Explains the score. The implementation here provides a basic explanation + * in the format score(name-of-similarity, doc=doc-id, + * freq=term-frequency), computed from:, and + * attaches the score (computed via the {@link #score(BasicStats, float, float)} + * method) and the explanation for the term frequency. Subclasses content with + * this format may add additional details in + * {@link #explain(Explanation, BasicStats, int, float, float)}. + * + * @param stats the corpus level statistics. + * @param doc the document id. + * @param freq the term frequency and its explanation. + * @param docLen the document length. + * @return the explanation. + */ + protected Explanation explain( + BasicStats stats, int doc, Explanation freq, float docLen) { + Explanation result = new Explanation(); + result.setValue(score(stats, freq.getValue(), docLen)); + result.setDescription("score(" + getClass().getSimpleName() + + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:"); + result.addDetail(freq); + + explain(result, stats, doc, freq.getValue(), docLen); + + return result; + } + + @Override + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, + AtomicReaderContext context) throws IOException { + byte norms[] = context.reader.norms(fieldName); + + if (stats instanceof MultiSimilarity.MultiStats) { + // a multi term query (e.g. phrase). 
return the summation, + // scoring almost as if it were boolean query + Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats; + ExactDocScorer subScorers[] = new ExactDocScorer[subStats.length]; + for (int i = 0; i < subScorers.length; i++) { + subScorers[i] = new BasicExactDocScorer((BasicStats)subStats[i], norms); + } + return new MultiSimilarity.MultiExactDocScorer(subScorers); + } else { + return new BasicExactDocScorer((BasicStats) stats, norms); + } + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, + AtomicReaderContext context) throws IOException { + byte norms[] = context.reader.norms(fieldName); + + if (stats instanceof MultiSimilarity.MultiStats) { + // a multi term query (e.g. phrase). return the summation, + // scoring almost as if it were boolean query + Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats; + SloppyDocScorer subScorers[] = new SloppyDocScorer[subStats.length]; + for (int i = 0; i < subScorers.length; i++) { + subScorers[i] = new BasicSloppyDocScorer((BasicStats)subStats[i], norms); + } + return new MultiSimilarity.MultiSloppyDocScorer(subScorers); + } else { + return new BasicSloppyDocScorer((BasicStats) stats, norms); + } + } + + /** + * Subclasses must override this method to return the name of the Similarity + * and preferably the values of parameters (if any) as well. + */ + @Override + public abstract String toString(); + + // ------------------------------ Norm handling ------------------------------ + + /** Norm -> document length map. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + float floatNorm = SmallFloat.byte315ToFloat((byte)i); + NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm); + } + } + + /** Encodes the document length in the same way as {@link TFIDFSimilarity}. 
*/ + @Override + public byte computeNorm(FieldInvertState state) { + final float numTerms; + if (discountOverlaps) + numTerms = state.getLength() - state.getNumOverlap(); + else + numTerms = state.getLength(); /* the boost is applied exactly once, by encodeNormValue */ + return encodeNormValue(state.getBoost(), numTerms); + } + + /** Decodes a normalization factor (document length) stored in an index. + * @see #encodeNormValue(float,float) + */ + protected float decodeNormValue(byte norm) { + return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + } + + /** Encodes the length to a byte via SmallFloat. */ + protected byte encodeNormValue(float boost, float length) { + return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length))); + } + + // ----------------------------- Static methods ------------------------------ + + /** Returns the base two logarithm of {@code x}. */ + public static double log2(double x) { + // Put this to a 'util' class if we need more of these. + return Math.log(x) / LOG_2; + } + + // --------------------------------- Classes --------------------------------- + + /** Delegates the {@link #score(int, int)} and + * {@link #explain(int, Explanation)} methods to + * {@link SimilarityBase#score(BasicStats, float, float)} and + * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)}, + * respectively. + */ + private class BasicExactDocScorer extends ExactDocScorer { + private final BasicStats stats; + private final byte[] norms; + + BasicExactDocScorer(BasicStats stats, byte norms[]) { + this.stats = stats; + this.norms = norms; + } + + @Override + public float score(int doc, int freq) { + // We have to supply something in case norms are omitted + return SimilarityBase.this.score(stats, freq, + norms == null ? 1F : decodeNormValue(norms[doc])); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return SimilarityBase.this.explain(stats, doc, freq, + norms == null ? 
1F : decodeNormValue(norms[doc])); + } + } + + /** Delegates the {@link #score(int, float)} and + * {@link #explain(int, Explanation)} methods to + * {@link SimilarityBase#score(BasicStats, float, float)} and + * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)}, + * respectively. + */ + private class BasicSloppyDocScorer extends SloppyDocScorer { + private final BasicStats stats; + private final byte[] norms; + + BasicSloppyDocScorer(BasicStats stats, byte norms[]) { + this.stats = stats; + this.norms = norms; + } + + @Override + public float score(int doc, float freq) { + // We have to supply something in case norms are omitted + return SimilarityBase.this.score(stats, freq, + norms == null ? 1F : decodeNormValue(norms[doc])); + } + @Override + public Explanation explain(int doc, Explanation freq) { + return SimilarityBase.this.explain(stats, doc, freq, + norms == null ? 1F : decodeNormValue(norms[doc])); + } + + @Override + public float computeSlopFactor(int distance) { + return 1.0f / (distance + 1); + } + + @Override + public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { + return 1f; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/SimilarityProvider.java b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java similarity index 96% rename from lucene/src/java/org/apache/lucene/search/SimilarityProvider.java rename to lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java index ef9a034e7eb..e3f6e86dca0 100644 --- a/lucene/src/java/org/apache/lucene/search/SimilarityProvider.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java @@ -1,4 +1,6 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; + +import org.apache.lucene.search.BooleanQuery; /** * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java 
b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java similarity index 97% rename from lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java rename to lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index 4209dd490cf..163fcb2c966 100644 --- a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,6 +21,10 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TermContext; import org.apache.lucene.util.SmallFloat; @@ -303,13 +307,13 @@ import org.apache.lucene.util.SmallFloat; * two term-queries with that same term and hence the computation would still be correct (although * not very efficient). * The default computation for tf(t in d) in - * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) DefaultSimilarity} is: * *
 
* * * *
- * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}   =   + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) tf(t in d)}   =   * * frequency½ @@ -328,13 +332,13 @@ import org.apache.lucene.util.SmallFloat; * idf(t) appears for t in both the query and the document, * hence it is squared in the equation. * The default computation for idf(t) in - * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: * *
 
* * * *
- * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =   + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)}  =   * * 1 + log ( @@ -376,14 +380,14 @@ import org.apache.lucene.util.SmallFloat; * This is a search time factor computed by the Similarity in effect at search time. * * The default computation in - * {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider} + * {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider} * produces a Euclidean norm: *
 
* * * *
* queryNorm(q)   =   - * {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)} + * {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)} *   =   * diff --git a/lucene/src/java/org/apache/lucene/search/similarities/package.html b/lucene/src/java/org/apache/lucene/search/similarities/package.html new file mode 100644 index 00000000000..f7d11eeaad2 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/package.html @@ -0,0 +1,174 @@ + + + + + + + +This package contains the various ranking models that can be used in Lucene. The +abstract class {@link org.apache.lucene.search.similarities.Similarity} serves +as the base for ranking functions. For searching, users can employ the models +already implemented or create their own by extending one of the classes in this +package. + +

Table Of Contents

+

+

    +
  1. Summary of the Ranking Methods
  2. +
  3. Similarity Providers
  4. +
  5. Changing the Similarity
  6. +
+

+ + + +

Summary of the Ranking Methods

+ +

{@link org.apache.lucene.search.similarities.DefaultSimilarity} is the original Lucene +scoring function. It is based on a highly optimized Vector Space Model. For more +information, see {@link org.apache.lucene.search.similarities.TFIDFSimilarity}.

+ +

{@link org.apache.lucene.search.similarities.BM25Similarity} is an optimized +implementation of the successful Okapi BM25 model.

+ +

{@link org.apache.lucene.search.similarities.SimilarityBase} provides a basic +implementation of the Similarity contract and exposes a highly simplified +interface, which makes it an ideal starting point for new ranking functions. +Lucene ships the following methods built on +{@link org.apache.lucene.search.similarities.SimilarityBase}: + + +

    +
  • Amati and Rijsbergen's {@linkplain org.apache.lucene.search.similarities.DFRSimilarity DFR} framework;
  • +
  • Clinchant and Gaussier's {@linkplain org.apache.lucene.search.similarities.IBSimilarity Information-based models} + for IR;
  • +
  • The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from + Zhai and Lafferty's paper.
  • +
+ +Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not +optimized to the same extent as +{@link org.apache.lucene.search.similarities.DefaultSimilarity} and +{@link org.apache.lucene.search.similarities.BM25Similarity}, a difference in +performance is to be expected when using the methods listed above. However, +optimizations can always be implemented in subclasses; see +below.

+ + + +

Similarity Providers

+ +

{@link org.apache.lucene.search.similarities.SimilarityProvider}s are factories +that return Similarities per-field and compute coordination factors and normalization +values for the query. +{@link org.apache.lucene.search.similarities.DefaultSimilarityProvider} is the +default implementation used by Lucene, geared towards vector-space search: it returns +{@link org.apache.lucene.search.similarities.DefaultSimilarity} for every field, +and implements coordination-level matching and query normalization. +{@link org.apache.lucene.search.similarities.BasicSimilarityProvider} is geared towards +non-vector-space models and does not implement coordination-level matching or query +normalization. It is a convenience implementation that returns an arbitrary +{@link org.apache.lucene.search.similarities.Similarity} for every field. +You can write your own SimilarityProvider to return different Similarities for different +fields: for example you might want to use different parameter values for different fields, +or maybe even entirely different ranking algorithms. +

+ + + +

Changing Similarity

+ +

Chances are the available Similarities are sufficient for all + your searching needs. + However, in some applications it may be necessary to customize your Similarity implementation. For instance, some + applications do not need to + distinguish between shorter and longer documents (see a "fair" similarity).

+ +

To change {@link org.apache.lucene.search.similarities.Similarity}, one must do so for both indexing and + searching, and the changes must happen before + either of these actions takes place. Although in theory there is nothing stopping you from changing mid-stream, it + just isn't well-defined what is going to happen. +

+ +

To make this change, implement your own {@link org.apache.lucene.search.similarities.Similarity} (likely + you'll want to simply subclass an existing method, be it + {@link org.apache.lucene.search.similarities.DefaultSimilarity} or a descendant of + {@link org.apache.lucene.search.similarities.SimilarityBase}) and + {@link org.apache.lucene.search.similarities.SimilarityProvider} (or use + {@link org.apache.lucene.search.similarities.BasicSimilarityProvider}), and + then register the new class by calling + {@link org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)} + before indexing and + {@link org.apache.lucene.search.IndexSearcher#setSimilarityProvider(SimilarityProvider)} + before searching. +

+ +

Extending {@linkplain org.apache.lucene.search.similarities.SimilarityBase}

+

+The easiest way to quickly implement a new ranking method is to extend +{@link org.apache.lucene.search.similarities.SimilarityBase}, which provides +basic implementations for the low-level details. Subclasses are only required to +implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)} +and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()} +methods.

+ +

Another option is to extend one of the frameworks +based on {@link org.apache.lucene.search.similarities.SimilarityBase}. These +Similarities are implemented modularly, e.g. +{@link org.apache.lucene.search.similarities.DFRSimilarity} delegates +computation of the three parts of its formula to the classes +{@link org.apache.lucene.search.similarities.BasicModel}, +{@link org.apache.lucene.search.similarities.AfterEffect} and +{@link org.apache.lucene.search.similarities.Normalization}. Instead of +subclassing the Similarity, one can simply introduce a new basic model and tell +{@link org.apache.lucene.search.similarities.DFRSimilarity} to use it.

+ +

Changing {@linkplain org.apache.lucene.search.similarities.DefaultSimilarity}

+

+ If you are interested in use cases for changing your similarity, see the Lucene users' mailing list at Overriding Similarity. + In summary, here are a few use cases: +

    +
  1. The SweetSpotSimilarity in + org.apache.lucene.misc gives small + increases as the frequency increases a small amount + and then greater increases when you hit the "sweet spot", i.e. where + you think the frequency of terms is more significant.

  2. +
  3. Overriding tf — In some applications, it doesn't matter what the score of a document is as long as a + matching term occurs. In these + cases people have overridden Similarity to return 1 from the tf() method.

  4. +
  5. Changing Length Normalization — By overriding + {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState state)}, + it is possible to discount how the length of a field contributes + to a score. In {@link org.apache.lucene.search.similarities.DefaultSimilarity}, + lengthNorm = 1 / (numTerms in field)^0.5, but if one changes this to be + 1 / (numTerms in field), all fields will be treated + "fairly".

  6. +
+ In general, Chris Hostetter sums it up best in saying (from the Lucene users' mailing list): +
[One would override the Similarity in] ... any situation where you know more about your data then just + that + it's "text" is a situation where it *might* make sense to to override your + Similarity method.
+

+ + + diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java index 573c91baa5b..d97307a150f 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java @@ -19,11 +19,9 @@ package org.apache.lucene.search.spans; import java.io.IOException; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; /** * Public for extension only. diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java index 14dba75eb37..e7415477297 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -21,7 +21,8 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; -import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer; import org.apache.lucene.util.TermContext; import java.io.IOException; diff --git a/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java b/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java index f9b73ea5558..1f815d58a54 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java +++ b/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java @@ -33,7 +33,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.index.FieldInfo.IndexOptions; import 
org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import static org.apache.lucene.util.LuceneTestCase.TEST_VERSION_CURRENT; diff --git a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java index 36362555e33..46c3e88f193 100644 --- a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java +++ b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.Locale; import java.util.Set; import java.util.TreeSet; import java.util.Random; @@ -35,7 +36,7 @@ public class CheckHits { * different order of operations from the actual scoring method ... * this allows for a small amount of variation */ - public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f; + public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.02f; /** * Tests that all documents up to maxDoc which are *not* in the @@ -327,6 +328,10 @@ public class CheckHits { if (!deep) return; Explanation detail[] = expl.getDetails(); + // TODO: can we improve this entire method? its really geared to work only with TF/IDF + if (expl.getDescription().endsWith("computed from:")) { + return; // something more complicated. + } if (detail!=null) { if (detail.length==1) { // simple containment, unless its a freq of: (which lets a query explain how the freq is calculated), @@ -338,7 +343,7 @@ public class CheckHits { // - end with one of: "product of:", "sum of:", "max of:", or // - have "max plus times others" (where is float). 
float x = 0; - String descr = expl.getDescription().toLowerCase(); + String descr = expl.getDescription().toLowerCase(Locale.ENGLISH); boolean productOf = descr.endsWith("product of:"); boolean sumOf = descr.endsWith("sum of:"); boolean maxOf = descr.endsWith("max of:"); diff --git a/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java b/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java new file mode 100644 index 00000000000..d7078a6c815 --- /dev/null +++ b/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java @@ -0,0 +1,158 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.apache.lucene.search.similarities.AfterEffect; +import org.apache.lucene.search.similarities.AfterEffectB; +import org.apache.lucene.search.similarities.AfterEffectL; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.BasicModel; +import org.apache.lucene.search.similarities.BasicModelBE; +import org.apache.lucene.search.similarities.BasicModelD; +import org.apache.lucene.search.similarities.BasicModelG; +import org.apache.lucene.search.similarities.BasicModelIF; +import org.apache.lucene.search.similarities.BasicModelIn; +import org.apache.lucene.search.similarities.BasicModelIne; +import org.apache.lucene.search.similarities.BasicModelP; +import org.apache.lucene.search.similarities.DFRSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Distribution; +import org.apache.lucene.search.similarities.DistributionLL; +import org.apache.lucene.search.similarities.DistributionSPL; +import org.apache.lucene.search.similarities.IBSimilarity; +import org.apache.lucene.search.similarities.LMDirichletSimilarity; +import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; +import org.apache.lucene.search.similarities.Lambda; +import org.apache.lucene.search.similarities.LambdaDF; +import org.apache.lucene.search.similarities.LambdaTTF; +import org.apache.lucene.search.similarities.Normalization; +import org.apache.lucene.search.similarities.NormalizationH1; +import org.apache.lucene.search.similarities.NormalizationH2; +import org.apache.lucene.search.similarities.NormalizationH3; +import org.apache.lucene.search.similarities.NormalizationZ; +import 
org.apache.lucene.search.similarities.Similarity; + +public class RandomSimilarityProvider extends DefaultSimilarityProvider { + final List<Similarity> knownSims; + Map<String,Similarity> previousMappings = new HashMap<String,Similarity>(); + final int perFieldSeed; + final boolean shouldCoord; + final boolean shouldQueryNorm; + + public RandomSimilarityProvider(Random random) { + perFieldSeed = random.nextInt(); + shouldCoord = random.nextBoolean(); + shouldQueryNorm = random.nextBoolean(); + knownSims = new ArrayList<Similarity>(allSims); + Collections.shuffle(knownSims, random); + } + + @Override + public float coord(int overlap, int maxOverlap) { + if (shouldCoord) { + return super.coord(overlap, maxOverlap); + } else { + return 1.0f; + } + } + + @Override + public float queryNorm(float sumOfSquaredWeights) { + if (shouldQueryNorm) { + return super.queryNorm(sumOfSquaredWeights); + } else { + return 1.0f; + } + } + + @Override + public synchronized Similarity get(String field) { + assert field != null; + Similarity sim = previousMappings.get(field); + if (sim == null) { + sim = knownSims.get(Math.abs((perFieldSeed ^ field.hashCode()) % knownSims.size())); /* abs() after %: Math.abs(Integer.MIN_VALUE) is negative and would index out of bounds */ + previousMappings.put(field, sim); + } + return sim; + } + + // all the similarities that we rotate through + /** The DFR basic models to test. */ + static BasicModel[] BASIC_MODELS = { + new BasicModelBE(), /* TODO: enable new BasicModelD(), */ new BasicModelG(), + new BasicModelIF(), new BasicModelIn(), new BasicModelIne(), + /* TODO: enable new BasicModelP() */ + }; + /** The DFR aftereffects to test. */ + static AfterEffect[] AFTER_EFFECTS = { + new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect() + }; + /** The DFR normalizations to test. */ + static Normalization[] NORMALIZATIONS = { + new NormalizationH1(), new NormalizationH2(), + new NormalizationH3(), new NormalizationZ() + // TODO: if we enable NoNormalization, we have to deal with + // a couple tests (e.g. 
TestDocBoost, TestSort) that expect length normalization + // new Normalization.NoNormalization() + }; + /** The distributions for IB. */ + static Distribution[] DISTRIBUTIONS = { + new DistributionLL(), new DistributionSPL() + }; + /** Lambdas for IB. */ + static Lambda[] LAMBDAS = { + new LambdaDF(), new LambdaTTF() + }; + static List allSims; + static { + allSims = new ArrayList(); + allSims.add(new DefaultSimilarity()); + allSims.add(new BM25Similarity()); + for (BasicModel basicModel : BASIC_MODELS) { + for (AfterEffect afterEffect : AFTER_EFFECTS) { + for (Normalization normalization : NORMALIZATIONS) { + allSims.add(new DFRSimilarity(basicModel, afterEffect, normalization)); + } + } + } + for (Distribution distribution : DISTRIBUTIONS) { + for (Lambda lambda : LAMBDAS) { + for (Normalization normalization : NORMALIZATIONS) { + allSims.add(new IBSimilarity(distribution, lambda, normalization)); + } + } + } + /* TODO: enable Dirichlet + allSims.add(new LMDirichletSimilarity()); */ + allSims.add(new LMJelinekMercerSimilarity(0.1f)); + allSims.add(new LMJelinekMercerSimilarity(0.7f)); + } + + @Override + public synchronized String toString() { + return "RandomSimilarityProvider(queryNorm=" + shouldQueryNorm + ",coord=" + shouldCoord + "): " + previousMappings.toString(); + } +} diff --git a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java index 8bcbb6f3a14..1dea2be32cc 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java @@ -52,6 +52,8 @@ import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.search.AssertingIndexSearcher; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.RandomSimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import 
org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FlushInfo; @@ -209,6 +211,8 @@ public abstract class LuceneTestCase extends Assert { private static Codec codec; // default codec provider private static CodecProvider savedCodecProvider; + + private static SimilarityProvider similarityProvider; private static Locale locale; private static Locale savedLocale; @@ -393,6 +397,7 @@ public abstract class LuceneTestCase extends Assert { savedTimeZone = TimeZone.getDefault(); timeZone = TEST_TIMEZONE.equals("random") ? randomTimeZone(random) : TimeZone.getTimeZone(TEST_TIMEZONE); TimeZone.setDefault(timeZone); + similarityProvider = new RandomSimilarityProvider(random); testsFailed = false; } @@ -467,6 +472,7 @@ public abstract class LuceneTestCase extends Assert { /** print some useful debugging information about the environment */ private static void printDebuggingInformation(String codecDescription) { System.err.println("NOTE: test params are: codec=" + codecDescription + + ", sim=" + similarityProvider + ", locale=" + locale + ", timezone=" + (timeZone == null ? "(null)" : timeZone.getID())); System.err.println("NOTE: all tests run in this JVM:"); @@ -922,6 +928,7 @@ public abstract class LuceneTestCase extends Assert { /** create a new index writer config with random defaults using the specified random */ public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) { IndexWriterConfig c = new IndexWriterConfig(v, a); + c.setSimilarityProvider(similarityProvider); if (r.nextBoolean()) { c.setMergeScheduler(new SerialMergeScheduler()); } @@ -1249,7 +1256,9 @@ public abstract class LuceneTestCase extends Assert { if (maybeWrap && rarely()) { r = new SlowMultiReaderWrapper(r); } - return random.nextBoolean() ? new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext()); + IndexSearcher ret = random.nextBoolean() ? 
new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext()); + ret.setSimilarityProvider(similarityProvider); + return ret; } else { int threads = 0; final ExecutorService ex = (random.nextBoolean()) ? null @@ -1258,7 +1267,7 @@ public abstract class LuceneTestCase extends Assert { if (ex != null && VERBOSE) { System.out.println("NOTE: newSearcher using ExecutorService with " + threads + " threads"); } - return random.nextBoolean() ? + IndexSearcher ret = random.nextBoolean() ? new AssertingIndexSearcher(r, ex) { @Override public void close() throws IOException { @@ -1272,6 +1281,8 @@ public abstract class LuceneTestCase extends Assert { shutdownExecutorService(ex); } }; + ret.setSimilarityProvider(similarityProvider); + return ret; } } diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index e294397233f..4e35e817fd3 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -35,13 +35,13 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; diff --git a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java 
b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java index ada1e4abb82..9d578679c66 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -27,11 +27,11 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java index 76005c2ad23..0ea80a3f746 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java @@ -25,7 +25,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index c45569ef5f7..1e3269cd649 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -40,9 +40,9 @@ import 
org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java index 42cf8c07cc4..570ee843fbb 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java @@ -17,7 +17,7 @@ package org.apache.lucene.index; * limitations under the License. */ -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java index c9cd59728e2..00b75754627 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java @@ -29,10 +29,10 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; -import 
org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java index bc0f114160a..0c12f1c147e 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java @@ -23,10 +23,11 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java index f981a1505e0..1e55007450f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java @@ -35,11 +35,13 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; 
import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitVector; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java index 6a3c16b5dea..816977e60a6 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java @@ -26,8 +26,8 @@ import java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; diff --git a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java index a40c1eeafea..c58b14fc031 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java +++ b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java @@ -27,9 +27,9 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; +import 
org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; diff --git a/lucene/src/test/org/apache/lucene/index/TestNorms.java b/lucene/src/test/org/apache/lucene/index/TestNorms.java index ba686ea81a6..98cd28c301a 100755 --- a/lucene/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestNorms.java @@ -27,10 +27,10 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java index 92e522aef90..8882e4103d5 100644 --- a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java +++ b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java @@ -32,6 +32,9 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; 
+import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.store.Directory; diff --git a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java index d584ca4953a..76ec8bb9423 100644 --- a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -234,7 +235,8 @@ public class TestParallelReader extends LuceneTestCase { w.addDocument(d2); w.close(); - return new IndexSearcher(dir, false); + IndexReader ir = IndexReader.open(dir, false); + return newSearcher(ir); } // Fields 1 & 2 in one index, 3 & 4 in other, with ParallelReader: diff --git a/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java index 568da06dc0a..dbab2bc0dd3 100644 --- a/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java +++ b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java @@ -25,9 +25,9 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; import 
org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; diff --git a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java index a76dbdc1958..f1f49bce5e1 100644 --- a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java +++ b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java @@ -20,9 +20,11 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.Similarity.ExactDocScorer; -import org.apache.lucene.search.Similarity.SloppyDocScorer; -import org.apache.lucene.search.Similarity.Stats; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; +import org.apache.lucene.search.similarities.Similarity.ExactDocScorer; +import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer; +import org.apache.lucene.search.similarities.Similarity.Stats; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TermContext; import org.apache.lucene.index.FieldInvertState; diff --git a/lucene/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/src/test/org/apache/lucene/search/TestBoolean2.java index 5afd53602bf..84369dc2de6 100644 --- a/lucene/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/src/test/org/apache/lucene/search/TestBoolean2.java @@ -26,6 +26,8 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.MockDirectoryWrapper; 
diff --git a/lucene/src/test/org/apache/lucene/search/TestBooleanQuery.java b/lucene/src/test/org/apache/lucene/search/TestBooleanQuery.java index e4d82391635..1f0726441e4 100644 --- a/lucene/src/test/org/apache/lucene/search/TestBooleanQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestBooleanQuery.java @@ -27,6 +27,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.NamedThreadFactory; @@ -72,6 +75,21 @@ public class TestBooleanQuery extends LuceneTestCase { IndexReader r = w.getReader(); IndexSearcher s = newSearcher(r); + // this test relies upon coord being the default implementation, + // otherwise scores are different! 
+ final SimilarityProvider delegate = s.getSimilarityProvider(); + s.setSimilarityProvider(new DefaultSimilarityProvider() { + @Override + public float queryNorm(float sumOfSquaredWeights) { + return delegate.queryNorm(sumOfSquaredWeights); + } + + @Override + public Similarity get(String field) { + return delegate.get(field); + } + }); + BooleanQuery q = new BooleanQuery(); q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD); @@ -81,7 +99,7 @@ public class TestBooleanQuery extends LuceneTestCase { subQuery.setBoost(0); q.add(subQuery, BooleanClause.Occur.SHOULD); float score2 = s.search(q, 10).getMaxScore(); - assertEquals(score*.5, score2, 1e-6); + assertEquals(score*.5F, score2, 1e-6); // LUCENE-2617: make sure that a clause not in the index still contributes to the score via coord factor BooleanQuery qq = (BooleanQuery)q.clone(); @@ -91,14 +109,14 @@ public class TestBooleanQuery extends LuceneTestCase { phrase.setBoost(0); qq.add(phrase, BooleanClause.Occur.SHOULD); score2 = s.search(qq, 10).getMaxScore(); - assertEquals(score*(1.0/3), score2, 1e-6); + assertEquals(score*(1/3F), score2, 1e-6); // now test BooleanScorer2 subQuery = new TermQuery(new Term("field", "b")); subQuery.setBoost(0); q.add(subQuery, BooleanClause.Occur.MUST); score2 = s.search(q, 10).getMaxScore(); - assertEquals(score*(2.0/3), score2, 1e-6); + assertEquals(score*(2/3F), score2, 1e-6); // PhraseQuery w/ no terms added returns a null scorer PhraseQuery pq = new PhraseQuery(); diff --git a/lucene/src/test/org/apache/lucene/search/TestComplexExplanations.java b/lucene/src/test/org/apache/lucene/search/TestComplexExplanations.java index eeaa4ff4c8d..4e0411f3158 100644 --- a/lucene/src/test/org/apache/lucene/search/TestComplexExplanations.java +++ b/lucene/src/test/org/apache/lucene/search/TestComplexExplanations.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; +import 
org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.search.spans.*; /** diff --git a/lucene/src/test/org/apache/lucene/search/TestConstantScoreQuery.java b/lucene/src/test/org/apache/lucene/search/TestConstantScoreQuery.java index b1d567b38fe..3ee0f8efd81 100644 --- a/lucene/src/test/org/apache/lucene/search/TestConstantScoreQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestConstantScoreQuery.java @@ -23,6 +23,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java index 171e035db2c..bdad0677609 100644 --- a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java @@ -29,6 +29,10 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import java.text.DecimalFormat; diff --git a/lucene/src/test/org/apache/lucene/search/TestDocBoost.java b/lucene/src/test/org/apache/lucene/search/TestDocBoost.java index e087e107e1b..a92ce7831b1 100644 --- a/lucene/src/test/org/apache/lucene/search/TestDocBoost.java +++ 
b/lucene/src/test/org/apache/lucene/search/TestDocBoost.java @@ -56,7 +56,8 @@ public class TestDocBoost extends LuceneTestCase { final float[] scores = new float[4]; - newSearcher(reader).search + IndexSearcher searcher = newSearcher(reader); + searcher.search (new TermQuery(new Term("field", "word")), new Collector() { private int base = 0; @@ -82,7 +83,10 @@ public class TestDocBoost extends LuceneTestCase { float lastScore = 0.0f; for (int i = 0; i < 2; i++) { - assertTrue(scores[i] > lastScore); + if (VERBOSE) { + System.out.println(searcher.explain(new TermQuery(new Term("field", "word")), i)); + } + assertTrue("score: " + scores[i] + " should be > lastScore: " + lastScore, scores[i] > lastScore); lastScore = scores[i]; } diff --git a/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java index bb4630f01dc..30990141ab2 100644 --- a/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java +++ b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java @@ -30,6 +30,9 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -71,13 +74,24 @@ public class TestDocValuesScoring extends LuceneTestCase { // no boosting IndexSearcher searcher1 = newSearcher(ir); + final SimilarityProvider base = searcher1.getSimilarityProvider(); // boosting IndexSearcher searcher2 = newSearcher(ir); - searcher2.setSimilarityProvider(new DefaultSimilarityProvider() { - final Similarity fooSim = new 
BoostingSimilarity(super.get("foo"), "foo_boost"); + searcher2.setSimilarityProvider(new SimilarityProvider() { + final Similarity fooSim = new BoostingSimilarity(base.get("foo"), "foo_boost"); public Similarity get(String field) { - return "foo".equals(field) ? fooSim : super.get(field); + return "foo".equals(field) ? fooSim : base.get(field); + } + + @Override + public float coord(int overlap, int maxOverlap) { + return base.coord(overlap, maxOverlap); + } + + @Override + public float queryNorm(float sumOfSquaredWeights) { + return base.queryNorm(sumOfSquaredWeights); } }); diff --git a/lucene/src/test/org/apache/lucene/search/TestElevationComparator.java b/lucene/src/test/org/apache/lucene/search/TestElevationComparator.java index cf7458152d5..ba2f176d452 100644 --- a/lucene/src/test/org/apache/lucene/search/TestElevationComparator.java +++ b/lucene/src/test/org/apache/lucene/search/TestElevationComparator.java @@ -23,6 +23,7 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.*; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.FieldValueHitQueue.Entry; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.*; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; @@ -41,7 +42,8 @@ public class TestElevationComparator extends LuceneTestCase { directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)). setMaxBufferedDocs(2). - setMergePolicy(newLogMergePolicy(1000)) + setMergePolicy(newLogMergePolicy(1000)). 
+ setSimilarityProvider(new DefaultSimilarityProvider()) ); writer.addDocument(adoc(new String[] {"id", "a", "title", "ipod", "str_s", "a"})); writer.addDocument(adoc(new String[] {"id", "b", "title", "ipod ipod", "str_s", "b"})); @@ -54,6 +56,7 @@ public class TestElevationComparator extends LuceneTestCase { writer.close(); IndexSearcher searcher = newSearcher(r); + searcher.setSimilarityProvider(new DefaultSimilarityProvider()); runTest(searcher, true); runTest(searcher, false); diff --git a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java index f75f1914041..81faa28d593 100644 --- a/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java +++ b/lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java @@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -104,6 +107,21 @@ public class TestFuzzyQuery2 extends LuceneTestCase { if (VERBOSE) { System.out.println("TEST: searcher=" + searcher); } + // even though this uses a boost-only rewrite, this test relies upon queryNorm being the default implementation, + // otherwise scores are different! 
+ final SimilarityProvider delegate = searcher.getSimilarityProvider(); + searcher.setSimilarityProvider(new DefaultSimilarityProvider() { + @Override + public float coord(int overlap, int maxOverlap) { + return delegate.coord(overlap, maxOverlap); + } + + @Override + public Similarity get(String field) { + return delegate.get(field); + } + }); + writer.close(); String line; while ((line = reader.readLine()) != null) { diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index a02629f0077..5ef3a09b701 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -37,6 +37,9 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java b/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java index 0b42d34d842..0f07ab42890 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java @@ -26,6 +26,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import 
org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -169,6 +172,19 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter { // test for correct application of query normalization // must use a non score normalizing method for this. + + final SimilarityProvider delegate = search.getSimilarityProvider(); + search.setSimilarityProvider(new DefaultSimilarityProvider() { + @Override + public float coord(int overlap, int maxOverlap) { + return delegate.coord(overlap, maxOverlap); + } + + @Override + public Similarity get(String field) { + return delegate.get(field); + } + }); Query q = csrq("data", "1", "6", T, T); q.setBoost(100); search.search(q, null, new Collector() { diff --git a/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java index 4a296db2a59..3cb9a282b76 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.document.*; import org.apache.lucene.index.*; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.*; import org.apache.lucene.util.Version; import org.apache.lucene.util._TestUtil; @@ -342,7 +343,10 @@ public class TestPhraseQuery extends LuceneTestCase { public void testSlopScoring() throws IOException { Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) + 
.setMergePolicy(newLogMergePolicy()) + .setSimilarityProvider(new DefaultSimilarityProvider())); Document doc = new Document(); doc.add(newField("field", "foo firstname lastname foo", TextField.TYPE_STORED)); @@ -360,6 +364,7 @@ public class TestPhraseQuery extends LuceneTestCase { writer.close(); IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarityProvider(new DefaultSimilarityProvider()); PhraseQuery query = new PhraseQuery(); query.add(new Term("field", "firstname")); query.add(new Term("field", "lastname")); diff --git a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java index 3a8efe15fc7..8d286890bb2 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java +++ b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; /** Document boost unit test. 
diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java index f62601eea85..4b249493515 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java @@ -27,6 +27,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java index 9e0878be388..9a0b908a20c 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java @@ -27,6 +27,9 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiNorms; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java index c7076c2542a..eb0a217bf85 100755 --- a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java +++ 
b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -17,6 +17,8 @@ package org.apache.lucene.search; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -25,6 +27,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -68,9 +71,9 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { */ public void testDoc1_Query1_All_Slops_Should_match() throws Exception { for (int slop=0; slop<30; slop++) { - float score1 = checkPhraseQuery(DOC_1, QUERY_1, slop, 1); - float score2 = checkPhraseQuery(DOC_1_B, QUERY_1, slop, 1); - assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + float freq1 = checkPhraseQuery(DOC_1, QUERY_1, slop, 1); + float freq2 = checkPhraseQuery(DOC_1_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" freq2="+freq2+" should be greater than freq1 "+freq1, freq2>freq1); } } @@ -82,10 +85,10 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { public void testDoc2_Query1_Slop_6_or_more_Should_match() throws Exception { for (int slop=0; slop<30; slop++) { int numResultsExpected = slop<6 ? 
0 : 1; - float score1 = checkPhraseQuery(DOC_2, QUERY_1, slop, numResultsExpected); + float freq1 = checkPhraseQuery(DOC_2, QUERY_1, slop, numResultsExpected); if (numResultsExpected>0) { - float score2 = checkPhraseQuery(DOC_2_B, QUERY_1, slop, 1); - assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + float freq2 = checkPhraseQuery(DOC_2_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" freq2="+freq2+" should be greater than freq1 "+freq1, freq2>freq1); } } } @@ -97,9 +100,9 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { */ public void testDoc2_Query2_All_Slops_Should_match() throws Exception { for (int slop=0; slop<30; slop++) { - float score1 = checkPhraseQuery(DOC_2, QUERY_2, slop, 1); - float score2 = checkPhraseQuery(DOC_2_B, QUERY_2, slop, 1); - assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + float freq1 = checkPhraseQuery(DOC_2, QUERY_2, slop, 1); + float freq2 = checkPhraseQuery(DOC_2_B, QUERY_2, slop, 1); + assertTrue("slop="+slop+" freq2="+freq2+" should be greater than freq1 "+freq1, freq2>freq1); } } @@ -109,9 +112,9 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { */ public void testDoc3_Query1_All_Slops_Should_match() throws Exception { for (int slop=0; slop<30; slop++) { - float score1 = checkPhraseQuery(DOC_3, QUERY_1, slop, 1); - float score2 = checkPhraseQuery(DOC_3_B, QUERY_1, slop, 1); - assertTrue("slop="+slop+" score2="+score2+" should be greater than score1 "+score1, score2>score1); + float freq1 = checkPhraseQuery(DOC_3, QUERY_1, slop, 1); + float freq2 = checkPhraseQuery(DOC_3_B, QUERY_1, slop, 1); + assertTrue("slop="+slop+" freq2="+freq2+" should be greater than freq1 "+freq1, freq2>freq1); } } @@ -140,9 +143,9 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { IndexReader reader = writer.getReader(); IndexSearcher searcher = newSearcher(reader); - TopDocs td = searcher.search(query,null,10); - 
//System.out.println("slop: "+slop+" query: "+query+" doc: "+doc+" Expecting number of hits: "+expectedNumResults+" maxScore="+td.getMaxScore()); - assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, td.totalHits); + MaxFreqCollector c = new MaxFreqCollector(); + searcher.search(query, c); + assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, c.totalHits); //QueryUtils.check(query,searcher); writer.close(); @@ -150,7 +153,9 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { reader.close(); ramDir.close(); - return td.getMaxScore(); + // returns the max Scorer.freq() found, because even though norms are omitted, many index stats are different + // with these different tokens/distributions/lengths.. otherwise this test is very fragile. + return c.max; } private static Document makeDocument(String docText) { @@ -171,4 +176,29 @@ public class TestSloppyPhraseQuery extends LuceneTestCase { return query; } + static class MaxFreqCollector extends Collector { + float max; + int totalHits; + Scorer scorer; + + @Override + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + } + + @Override + public void collect(int doc) throws IOException { + totalHits++; + max = Math.max(max, scorer.freq()); + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + } } diff --git a/lucene/src/test/org/apache/lucene/search/TestTermScorer.java b/lucene/src/test/org/apache/lucene/search/TestTermScorer.java index 100c442f4c3..22b91351d77 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/src/test/org/apache/lucene/search/TestTermScorer.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SlowMultiReaderWrapper; import org.apache.lucene.index.Term; 
import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -47,7 +48,10 @@ public class TestTermScorer extends LuceneTestCase { super.setUp(); directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) + .setMergePolicy(newLogMergePolicy()) + .setSimilarityProvider(new DefaultSimilarityProvider())); for (int i = 0; i < values.length; i++) { Document doc = new Document(); doc @@ -57,6 +61,7 @@ public class TestTermScorer extends LuceneTestCase { indexReader = new SlowMultiReaderWrapper(writer.getReader()); writer.close(); indexSearcher = newSearcher(indexReader); + indexSearcher.setSimilarityProvider(new DefaultSimilarityProvider()); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/TestTermVectors.java b/lucene/src/test/org/apache/lucene/search/TestTermVectors.java index c848d59bcf1..d2555bf3681 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTermVectors.java +++ b/lucene/src/test/org/apache/lucene/search/TestTermVectors.java @@ -28,6 +28,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; import org.apache.lucene.index.*; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; @@ -244,7 +245,9 @@ public class TestTermVectors extends LuceneTestCase { RandomIndexWriter writer = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.SIMPLE, true)) 
- .setOpenMode(OpenMode.CREATE).setMergePolicy(newLogMergePolicy())); + .setOpenMode(OpenMode.CREATE) + .setMergePolicy(newLogMergePolicy()) + .setSimilarityProvider(new DefaultSimilarityProvider())); writer.addDocument(testDoc1); writer.addDocument(testDoc2); writer.addDocument(testDoc3); @@ -252,6 +255,7 @@ public class TestTermVectors extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); IndexSearcher knownSearcher = newSearcher(reader); + knownSearcher.setSimilarityProvider(new DefaultSimilarityProvider()); FieldsEnum fields = MultiFields.getFields(knownSearcher.reader).iterator(); DocsEnum docs = null; diff --git a/lucene/src/test/org/apache/lucene/search/payloads/PayloadHelper.java b/lucene/src/test/org/apache/lucene/search/payloads/PayloadHelper.java index 1c33f7acc28..d799ca65df2 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/PayloadHelper.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/PayloadHelper.java @@ -29,7 +29,7 @@ import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.RAMDirectory; diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadExplanations.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadExplanations.java index bc64e05e306..5dbe6466e38 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadExplanations.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadExplanations.java @@ -18,9 +18,9 @@ package org.apache.lucene.search.payloads; */ import org.apache.lucene.index.Term; -import org.apache.lucene.search.DefaultSimilarity; -import 
org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.TestExplanations; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java index 82564abcfdf..0b216bcfd87 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java @@ -27,14 +27,14 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Payload; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.QueryUtils; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanTermQuery; diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java index f8c6329f9d2..8a102eda658 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java +++ 
b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java @@ -20,17 +20,17 @@ import org.apache.lucene.analysis.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.English; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.QueryUtils; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.CheckHits; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.search.spans.MultiSpansWrapper; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.Spans; diff --git a/lucene/src/test/org/apache/lucene/search/similarities/SpoofIndexSearcher.java b/lucene/src/test/org/apache/lucene/search/similarities/SpoofIndexSearcher.java new file mode 100644 index 00000000000..ab0ea1843da --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/similarities/SpoofIndexSearcher.java @@ -0,0 +1,216 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.Map; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** + * Index searcher implementation that takes an {@link BasicStats} instance and + * returns statistics accordingly. Most of the methods are not implemented, so + * it can only be used for Similarity unit testing. + */ +public class SpoofIndexSearcher extends IndexSearcher { + public SpoofIndexSearcher(BasicStats stats) { + super(new SpoofIndexReader(stats)); + } + + public static class SpoofIndexReader extends IndexReader { + /** The stats the reader has to return. */ + protected BasicStats stats; + /** The fields the reader has to return. 
*/ + protected SpoofFields fields; + + public SpoofIndexReader(BasicStats stats) { + this.stats = stats; + this.fields = new SpoofFields(stats); + } + + @Override + public int numDocs() { + return stats.getNumberOfDocuments(); + } + + @Override + public int maxDoc() { + return stats.getNumberOfDocuments(); + } + + @Override + public Fields fields() throws IOException { + return fields; + } + + @Override + public Collection getFieldNames(FieldOption fldOption) { + return Arrays.asList(new String[]{"spoof"}); + } + + @Override + public ReaderContext getTopReaderContext() { + return new AtomicReaderContext(this); + } + + @Override + public boolean hasDeletions() { + return false; + } + + // ------------------------ Not implemented methods ------------------------ + + @Override + public TermFreqVector[] getTermFreqVectors(int docNumber) + throws IOException { + return null; + } + + @Override + public TermFreqVector getTermFreqVector(int docNumber, String field) + throws IOException { + return null; + } + + @Override + public void getTermFreqVector(int docNumber, String field, + TermVectorMapper mapper) throws IOException { + } + + @Override + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) + throws IOException { + } + + @Override + public void document(int docID, StoredFieldVisitor visitor) throws CorruptIndexException, IOException { + } + + @Override + public byte[] norms(String field) throws IOException { + return null; + } + + @Override + protected void doSetNorm(int doc, String field, byte value) + throws CorruptIndexException, IOException { + } + + @Override + public PerDocValues perDocValues() throws IOException { + return null; + } + + @Override + protected void doDelete(int docNum) throws CorruptIndexException, + IOException { + } + + @Override + protected void doUndeleteAll() throws CorruptIndexException, IOException { + } + + @Override + protected void doCommit(Map commitUserData) + throws IOException { + } + + @Override + protected void 
doClose() throws IOException { + } + + @Override + public Bits getLiveDocs() { + return null; + } + } + + /** Spoof Fields class for Similarity testing. */ + public static class SpoofFields extends Fields { + /** The stats the object has to return. */ + protected SpoofTerms terms; + + public SpoofFields(BasicStats stats) { + this.terms = new SpoofTerms(stats); + } + + @Override + public Terms terms(String field) throws IOException { + return terms; + } + + // ------------------------ Not implemented methods ------------------------ + + @Override + public FieldsEnum iterator() throws IOException { + return null; + } + } + + /** Spoof Terms class for Similarity testing. */ + public static class SpoofTerms extends Terms { + /** The stats the object has to return. */ + protected BasicStats stats; + + public SpoofTerms(BasicStats stats) { + this.stats = stats; + } + + @Override + public long getSumTotalTermFreq() throws IOException { + return stats.getNumberOfFieldTokens(); + } + + @Override + public long getSumDocFreq() throws IOException { + return stats.getDocFreq(); + } + + @Override + public int getDocCount() throws IOException { + return stats.getDocFreq(); + } + + // ------------------------ Not implemented methods ------------------------ + + @Override + public TermsEnum iterator() throws IOException { + return null; + } + + @Override + public Comparator getComparator() throws IOException { + return null; + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java new file mode 100644 index 00000000000..110db076495 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java @@ -0,0 +1,218 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Tests against all the similarities we have + */ +public class TestSimilarity2 extends LuceneTestCase { + List simProviders; + + @Override + public void setUp() throws Exception { + super.setUp(); + simProviders = new ArrayList(); + simProviders.add(new BasicSimilarityProvider(new DefaultSimilarity())); + 
simProviders.add(new BasicSimilarityProvider(new BM25Similarity())); + // TODO: not great that we dup this all with TestSimilarityBase + for (BasicModel basicModel : TestSimilarityBase.BASIC_MODELS) { + for (AfterEffect afterEffect : TestSimilarityBase.AFTER_EFFECTS) { + for (Normalization normalization : TestSimilarityBase.NORMALIZATIONS) { + simProviders.add(new BasicSimilarityProvider(new DFRSimilarity(basicModel, afterEffect, normalization))); + } + } + } + for (Distribution distribution : TestSimilarityBase.DISTRIBUTIONS) { + for (Lambda lambda : TestSimilarityBase.LAMBDAS) { + for (Normalization normalization : TestSimilarityBase.NORMALIZATIONS) { + simProviders.add(new BasicSimilarityProvider(new IBSimilarity(distribution, lambda, normalization))); + } + } + } + simProviders.add(new BasicSimilarityProvider(new LMDirichletSimilarity())); + simProviders.add(new BasicSimilarityProvider(new LMJelinekMercerSimilarity(0.1f))); + simProviders.add(new BasicSimilarityProvider(new LMJelinekMercerSimilarity(0.7f))); + } + + /** because of stupid things like querynorm, its possible we computeStats on a field that doesnt exist at all + * test this against a totally empty index, to make sure sims handle it + */ + public void testEmptyIndex() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + assertEquals(0, is.search(new TermQuery(new Term("foo", "bar")), 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } + + /** similar to the above, but ORs the query with a real field */ + public void testEmptyField() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + doc.add(newField("foo", "bar", TextField.TYPE_UNSTORED)); + 
iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + BooleanQuery query = new BooleanQuery(true); + query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("bar", "baz")), BooleanClause.Occur.SHOULD); + assertEquals(1, is.search(query, 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } + + /** similar to the above, however the field exists, but we query with a term that doesnt exist too */ + public void testEmptyTerm() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + doc.add(newField("foo", "bar", TextField.TYPE_UNSTORED)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + BooleanQuery query = new BooleanQuery(true); + query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD); + assertEquals(1, is.search(query, 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } + + /** make sure we can retrieve when norms are disabled */ + public void testNoNorms() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + FieldType ft = new FieldType(TextField.TYPE_UNSTORED); + ft.setOmitNorms(true); + ft.freeze(); + doc.add(newField("foo", "bar", ft)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + BooleanQuery query = new 
BooleanQuery(true); + query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); + assertEquals(1, is.search(query, 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } + + /** make sure all sims work if TF is omitted */ + public void testOmitTF() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + FieldType ft = new FieldType(TextField.TYPE_UNSTORED); + ft.setIndexOptions(IndexOptions.DOCS_ONLY); + ft.freeze(); + Field f = newField("foo", "bar", ft); + doc.add(f); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + BooleanQuery query = new BooleanQuery(true); + query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); + assertEquals(1, is.search(query, 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } + + /** make sure all sims work if TF and norms is omitted */ + public void testOmitTFAndNorms() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + FieldType ft = new FieldType(TextField.TYPE_UNSTORED); + ft.setIndexOptions(IndexOptions.DOCS_ONLY); + ft.setOmitNorms(true); + ft.freeze(); + Field f = newField("foo", "bar", ft); + doc.add(f); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + BooleanQuery query = new BooleanQuery(true); + query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD); + assertEquals(1, is.search(query, 10).totalHits); + } + is.close(); + ir.close(); + dir.close(); + } +} diff --git 
a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java new file mode 100644 index 00000000000..4f50880302e --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -0,0 +1,587 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.OrdTermState; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; +import org.junit.Ignore; + +/** + * Tests the {@link SimilarityBase}-based Similarities. Contains unit tests and + * integration tests for all Similarities and correctness tests for a select + * few. + *

<p>This class maintains a list of + * {@code SimilarityBase} subclasses. Each test case performs its test on all + * items in the list. If a test case fails, the name of the Similarity that + * caused the failure is reported as part of the assertion error message.</p>

+ *

<p>Unit testing is performed by constructing statistics manually and calling + * the {@link SimilarityBase#score(BasicStats, float, int)} method of the + * Similarities. The statistics represent corner cases of corpus distributions. + * </p>

+ *

<p>For the integration tests, a small (8-document) collection is indexed. The + * tests verify that for a specific query, all relevant documents are returned + * in the correct order. The collection consists of two poems by the English poet + * William Blake.</p>

+ *

<p>Note: the list of Similarities is maintained by hand. If a new Similarity + * is added to the {@code org.apache.lucene.search.similarities} package, the + * list should be updated accordingly.</p>

+ *

<p> + * In the correctness tests, the score is verified against the result of manual + * computation. Since it would be impossible to test all Similarities + * (e.g. all possible DFR combinations, all parameter values for LM), only + * the best-performing setups in the original papers are verified. + * </p>

+ */ +public class TestSimilarityBase extends LuceneTestCase { + private static String FIELD_BODY = "body"; + private static String FIELD_ID = "id"; + /** The tolerance range for float equality. */ + private static float FLOAT_EPSILON = 1e-5f; + /** The DFR basic models to test. */ + static BasicModel[] BASIC_MODELS = { + new BasicModelBE(), new BasicModelD(), new BasicModelG(), + new BasicModelIF(), new BasicModelIn(), new BasicModelIne(), + new BasicModelP() + }; + /** The DFR aftereffects to test. */ + static AfterEffect[] AFTER_EFFECTS = { + new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect() + }; + /** The DFR normalizations to test. */ + static Normalization[] NORMALIZATIONS = { + new NormalizationH1(), new NormalizationH2(), new NormalizationH3(), + new NormalizationZ(), new Normalization.NoNormalization() + }; + /** The distributions for IB. */ + static Distribution[] DISTRIBUTIONS = { + new DistributionLL(), new DistributionSPL() + }; + /** Lambdas for IB. */ + static Lambda[] LAMBDAS = { + new LambdaDF(), new LambdaTTF() + }; + + private IndexSearcher searcher; + private Directory dir; + private IndexReader reader; + /** The list of similarities to test. 
*/ + private List sims; + + @Override + public void setUp() throws Exception { + super.setUp(); + + dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); + + for (int i = 0; i < docs.length; i++) { + Document d = new Document(); + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setIndexed(false); + d.add(newField(FIELD_ID, Integer.toString(i), ft)); + d.add(newField(FIELD_BODY, docs[i], TextField.TYPE_STORED)); + writer.addDocument(d); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + + sims = new ArrayList(); + for (BasicModel basicModel : BASIC_MODELS) { + for (AfterEffect afterEffect : AFTER_EFFECTS) { + for (Normalization normalization : NORMALIZATIONS) { + sims.add(new DFRSimilarity(basicModel, afterEffect, normalization)); + } + } + } + for (Distribution distribution : DISTRIBUTIONS) { + for (Lambda lambda : LAMBDAS) { + for (Normalization normalization : NORMALIZATIONS) { + sims.add(new IBSimilarity(distribution, lambda, normalization)); + } + } + } + sims.add(new LMDirichletSimilarity()); + sims.add(new LMJelinekMercerSimilarity(0.1f)); + sims.add(new LMJelinekMercerSimilarity(0.7f)); + } + + // ------------------------------- Unit tests -------------------------------- + + /** The default number of documents in the unit tests. */ + private static int NUMBER_OF_DOCUMENTS = 100; + /** The default total number of tokens in the field in the unit tests. */ + private static long NUMBER_OF_FIELD_TOKENS = 5000; + /** The default average field length in the unit tests. */ + private static float AVG_FIELD_LENGTH = 50; + /** The default document frequency in the unit tests. */ + private static int DOC_FREQ = 10; + /** + * The default total number of occurrences of this term across all documents + * in the unit tests. + */ + private static long TOTAL_TERM_FREQ = 70; + + /** The default tf in the unit tests. 
*/ + private static float FREQ = 7; + /** The default document length in the unit tests. */ + private static int DOC_LEN = 40; + + /** Creates the default statistics object that the specific tests modify. */ + private BasicStats createStats() { + BasicStats stats = new BasicStats(1); + stats.setNumberOfDocuments(NUMBER_OF_DOCUMENTS); + stats.setNumberOfFieldTokens(NUMBER_OF_FIELD_TOKENS); + stats.setAvgFieldLength(AVG_FIELD_LENGTH); + stats.setDocFreq(DOC_FREQ); + stats.setTotalTermFreq(TOTAL_TERM_FREQ); + return stats; + } + + /** + * The generic test core called by all unit test methods. It calls the + * {@link SimilarityBase#score(BasicStats, float, int)} method of all + * Similarities in {@link #sims} and checks if the score is valid; i.e. it + * is a finite positive real number. + */ + private void unitTestCore(BasicStats stats, float freq, int docLen) + throws IOException { + // We have to fake everything, because computeStats() can be overridden and + // there is no way to inject false data after fillBasicStats(). + SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats); + TermContext tc = new TermContext( + searcher.getIndexReader().getTopReaderContext(), + new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq()); + + for (SimilarityBase sim : sims) { + BasicStats realStats = (BasicStats) sim.computeStats(new SpoofIndexSearcher(stats), + "spoof", stats.getTotalBoost(), tc); + float score = sim.score(realStats, freq, docLen); + float explScore = sim.explain( + realStats, 1, new Explanation(freq, "freq"), docLen).getValue(); + assertFalse("Score infinite: " + sim.toString(), Float.isInfinite(score)); + assertFalse("Score NaN: " + sim.toString(), Float.isNaN(score)); + assertTrue("Score negative: " + sim.toString(), score >= 0); + assertEquals("score() and explain() return different values: " + + sim.toString(), score, explScore, FLOAT_EPSILON); + } + } + + /** Runs the unit test with the default statistics. 
*/ + public void testDefault() throws IOException { + unitTestCore(createStats(), FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code numberOfDocuments = numberOfFieldTokens}. + */ + public void testSparseDocuments() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfFieldTokens(stats.getNumberOfDocuments()); + stats.setTotalTermFreq(stats.getDocFreq()); + stats.setAvgFieldLength( + (float)stats.getNumberOfFieldTokens() / stats.getNumberOfDocuments()); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code numberOfDocuments > numberOfFieldTokens}. + */ + public void testVerySparseDocuments() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfFieldTokens(stats.getNumberOfDocuments() * 2 / 3); + stats.setTotalTermFreq(stats.getDocFreq()); + stats.setAvgFieldLength( + (float)stats.getNumberOfFieldTokens() / stats.getNumberOfDocuments()); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code NumberOfDocuments = 1}. + */ + public void testOneDocument() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfDocuments(1); + stats.setNumberOfFieldTokens(DOC_LEN); + stats.setAvgFieldLength(DOC_LEN); + stats.setDocFreq(1); + stats.setTotalTermFreq((int)FREQ); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code docFreq = numberOfDocuments}. + */ + public void testAllDocumentsRelevant() throws IOException { + BasicStats stats = createStats(); + float mult = (0.0f + stats.getNumberOfDocuments()) / stats.getDocFreq(); + stats.setTotalTermFreq((int)(stats.getTotalTermFreq() * mult)); + stats.setDocFreq(stats.getNumberOfDocuments()); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code docFreq > numberOfDocuments / 2}. 
+ */ + public void testMostDocumentsRelevant() throws IOException { + BasicStats stats = createStats(); + float mult = (0.6f * stats.getNumberOfDocuments()) / stats.getDocFreq(); + stats.setTotalTermFreq((int)(stats.getTotalTermFreq() * mult)); + stats.setDocFreq((int)(stats.getNumberOfDocuments() * 0.6)); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code docFreq = 1}. + */ + public void testOnlyOneRelevantDocument() throws IOException { + BasicStats stats = createStats(); + stats.setDocFreq(1); + stats.setTotalTermFreq((int)FREQ + 3); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code totalTermFreq = numberOfFieldTokens}. + */ + public void testAllTermsRelevant() throws IOException { + BasicStats stats = createStats(); + stats.setTotalTermFreq(stats.getNumberOfFieldTokens()); + unitTestCore(stats, DOC_LEN, DOC_LEN); + stats.setAvgFieldLength(DOC_LEN + 10); + unitTestCore(stats, DOC_LEN, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code totalTermFreq > numberOfDocuments}. + */ + public void testMoreTermsThanDocuments() throws IOException { + BasicStats stats = createStats(); + stats.setTotalTermFreq( + stats.getTotalTermFreq() + stats.getNumberOfDocuments()); + unitTestCore(stats, 2 * FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when + * {@code totalTermFreq = numberOfDocuments}. + */ + public void testNumberOfTermsAsDocuments() throws IOException { + BasicStats stats = createStats(); + stats.setTotalTermFreq(stats.getNumberOfDocuments()); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when {@code totalTermFreq = 1}. + */ + public void testOneTerm() throws IOException { + BasicStats stats = createStats(); + stats.setDocFreq(1); + stats.setTotalTermFreq(1); + unitTestCore(stats, 1, DOC_LEN); + } + + /** + * Tests correct behavior when {@code totalTermFreq = freq}. 
+ */ + public void testOneRelevantDocument() throws IOException { + BasicStats stats = createStats(); + stats.setDocFreq(1); + stats.setTotalTermFreq((int)FREQ); + unitTestCore(stats, FREQ, DOC_LEN); + } + + /** + * Tests correct behavior when {@code numberOfFieldTokens = freq}. + */ + public void testAllTermsRelevantOnlyOneDocument() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfDocuments(10); + stats.setNumberOfFieldTokens(50); + stats.setAvgFieldLength(5); + stats.setDocFreq(1); + stats.setTotalTermFreq(50); + unitTestCore(stats, 50, 50); + } + + /** + * Tests correct behavior when there is only one document with a single term + * in the collection. + */ + public void testOnlyOneTermOneDocument() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfDocuments(1); + stats.setNumberOfFieldTokens(1); + stats.setAvgFieldLength(1); + stats.setDocFreq(1); + stats.setTotalTermFreq(1); + unitTestCore(stats, 1, 1); + } + + /** + * Tests correct behavior when there is only one term in the field, but + * more than one document. + */ + public void testOnlyOneTerm() throws IOException { + BasicStats stats = createStats(); + stats.setNumberOfFieldTokens(1); + stats.setAvgFieldLength(1.0f / stats.getNumberOfDocuments()); + stats.setDocFreq(1); + stats.setTotalTermFreq(1); + unitTestCore(stats, 1, DOC_LEN); + } + + /** + * Tests correct behavior when {@code avgFieldLength = docLen}. + */ + public void testDocumentLengthAverage() throws IOException { + BasicStats stats = createStats(); + unitTestCore(stats, FREQ, (int)stats.getAvgFieldLength()); + } + + // ---------------------------- Correctness tests ---------------------------- + + /** Correctness test for the Dirichlet LM model.
*/ + public void testLMDirichlet() throws IOException { + float p = + (FREQ + 2000.0f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)) / + (DOC_LEN + 2000.0f); + float a = 2000.0f / (DOC_LEN + 2000.0f); + float gold = (float)( + Math.log(p / (a * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))) + + Math.log(a)); + correctnessTestCore(new LMDirichletSimilarity(), gold); + } + + /** Correctness test for the Jelinek-Mercer LM model. */ + public void testLMJelinekMercer() throws IOException { + float p = (1 - 0.1f) * FREQ / DOC_LEN + + 0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f); + float gold = (float)(Math.log( + p / (0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)))); + correctnessTestCore(new LMJelinekMercerSimilarity(0.1f), gold); + } + + /** + * Correctness test for the LL IB model with DF-based lambda and + * no normalization. + */ + public void testLLForIB() throws IOException { + SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization()); + correctnessTestCore(sim, 4.26267987704f); + } + + /** + * Correctness test for the SPL IB model with TTF-based lambda and + * no normalization. + */ + public void testSPLForIB() throws IOException { + SimilarityBase sim = + new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization()); + correctnessTestCore(sim, 2.24069910825f); + } + + /** Correctness test for the PL2 DFR model. 
*/ + public void testPL2() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelP(), new AfterEffectL(), new NormalizationH2()); + float tfn = (float)(FREQ * SimilarityBase.log2( + 1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101 + float l = 1.0f / (tfn + 1.0f); // 0.108820144666 + float lambda = (1.0f * TOTAL_TERM_FREQ) / NUMBER_OF_DOCUMENTS; // 0.7 + float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) + + (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) + + 0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.1113611585 + float gold = l * p; // 2.29734137536 + correctnessTestCore(sim, gold); + } + + /** Correctness test for the IneB2 DFR model. */ + public void testIneB2() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); + correctnessTestCore(sim, 6.23455315685f); + } + + /** Correctness test for the GL1 DFR model. */ + public void testGL1() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelG(), new AfterEffectL(), new NormalizationH1()); + correctnessTestCore(sim, 1.6463143825531006f); + } + + /** Correctness test for the BEB1 DFR model. 
*/ + public void testBEB1() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelBE(), new AfterEffectB(), new NormalizationH1()); + float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75 + float b = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (tfn + 1)); // 0.728205128205 + float f = TOTAL_TERM_FREQ + tfn; + float n = f + NUMBER_OF_DOCUMENTS; + float n1 = n + f - 1; // 256.5 + float m1 = n + f - tfn - 2; // 246.75 + float n2 = f; // 78.75 + float m2 = f - tfn; // 70.0 + float be = (float)(-SimilarityBase.log2(n - 1) - + SimilarityBase.log2(Math.E) + // -8.916400790508378 + ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) + + (n1 - m1) * SimilarityBase.log2(n1)) - // 91.85089272283668 + ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) + + (n2 - m2) * SimilarityBase.log2(n2))); // 67.09778276257171 + // 15.836709 + float gold = b * be; // 11.532373 + correctnessTestCore(sim, gold); + } + + /** Correctness test for the D DFR model (basic model only). */ + public void testD() throws IOException { + SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization()); + double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ; + double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099 + double phi = FREQ / totalTermFreqNorm; // 0.09090909090909091 + double D = phi * SimilarityBase.log2(phi / p) + // 0.17884523239871358 + (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p)); + float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2( + 1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.449575 + correctnessTestCore(sim, gold); + } + + /** Correctness test for the In2 DFR model with no aftereffect. 
*/ + public void testIn2() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2()); + float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101 + 1 + AVG_FIELD_LENGTH / DOC_LEN)); + float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898 + (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5))); + correctnessTestCore(sim, gold); + } + + /** Correctness test for the IFB DFR model with no normalization. */ + public void testIFB() throws IOException { + SimilarityBase sim = new DFRSimilarity( + new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization()); + float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875 + float IF = (float)(FREQ * SimilarityBase.log2( // 8.97759389642 + 1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5))); + float gold = B * IF; // 7.96761458307 + correctnessTestCore(sim, gold); + } + + /** + * The generic test core called by all correctness test methods. It calls the + * {@link SimilarityBase#score(BasicStats, float, int)} method of the given + * Similarity {@code sim} and compares the score against the manually + * computed {@code gold}. + */ + private void correctnessTestCore(SimilarityBase sim, float gold) + throws IOException { + // We have to fake everything, because computeStats() can be overridden and + // there is no way to inject false data after fillBasicStats().
+ BasicStats stats = createStats(); + SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats); + TermContext tc = new TermContext( + searcher.getIndexReader().getTopReaderContext(), + new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq()); + + BasicStats realStats = (BasicStats) sim.computeStats( + searcher, "spoof", stats.getTotalBoost(), tc); + float score = sim.score(realStats, FREQ, DOC_LEN); + assertEquals( + sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON); + } + + // ---------------------------- Integration tests ---------------------------- + + /** The "collection" for the integration tests. */ + String[] docs = new String[] { + "Tiger, tiger burning bright In the forest of the night What immortal hand or eye Could frame thy fearful symmetry ?", + "In what distant depths or skies Burnt the fire of thine eyes ? On what wings dare he aspire ? What the hands the seize the fire ?", + "And what shoulder and what art Could twist the sinews of thy heart ? And when thy heart began to beat What dread hand ? And what dread feet ?", + "What the hammer? What the chain ? In what furnace was thy brain ? What the anvil ? And what dread grasp Dare its deadly terrors clasp ?", + "And when the stars threw down their spears And water'd heaven with their tear Did he smile his work to see ? Did he, who made the lamb, made thee ?", + "Tiger, tiger burning bright In the forest of the night What immortal hand or eye Dare frame thy fearful symmetry ?", + "Cruelty has a human heart And jealousy a human face Terror the human form divine And Secrecy the human dress .", + "The human dress is forg'd iron The human form a fiery forge The human face a furnace seal'd The human heart its fiery gorge ." + }; + + /** + * Tests whether all similarities return three documents for the query word + * "heart". 
+ */ + public void testHeartList() throws IOException { + Query q = new TermQuery(new Term(FIELD_BODY, "heart")); + + for (SimilarityBase sim : sims) { + searcher.setSimilarityProvider(new BasicSimilarityProvider(sim)); + TopDocs topDocs = searcher.search(q, 1000); + assertEquals("Failed: " + sim.toString(), 3, topDocs.totalHits); + } + } + + /** Test whether all similarities return document 3 before documents 7 and 8. */ + public void testHeartRanking() throws IOException { + assumeFalse("PreFlex codec does not support the stats necessary for this test!", + "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec())); + + Query q = new TermQuery(new Term(FIELD_BODY, "heart")); + + for (SimilarityBase sim : sims) { + searcher.setSimilarityProvider(new BasicSimilarityProvider(sim)); + TopDocs topDocs = searcher.search(q, 1000); + assertEquals("Failed: " + sim.toString(), 2, topDocs.scoreDocs[0].doc); + } + } + + @Override + public void tearDown() throws Exception { + searcher.close(); + reader.close(); + dir.close(); + super.tearDown(); + } +} diff --git a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java index 9b2ee4b5289..8611056d316 100644 --- a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java +++ b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java @@ -22,7 +22,7 @@ import java.util.Collection; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; /** * Holds all implementations of classes in the o.a.l.s.spans package as a diff --git a/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java b/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java index 6661eca1dfb..ed6d5b7b02d 100644 --- 
a/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java +++ b/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java @@ -37,13 +37,13 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Payload; import org.apache.lucene.index.Term; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.payloads.PayloadHelper; import org.apache.lucene.search.payloads.PayloadSpanUtil; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java b/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java index c222ceb4758..573e18e19ce 100644 --- a/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java +++ b/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java @@ -17,17 +17,17 @@ package org.apache.lucene.search.spans; * limitations under the License. 
*/ -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Query; import org.apache.lucene.search.CheckHits; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader.ReaderContext; diff --git a/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java b/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java index 97591b7c1f9..6e18cf62c9e 100644 --- a/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java +++ b/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; import org.apache.lucene.store.Directory; /******************************************************************************* @@ -57,10 +58,10 @@ public class TestSpansAdvanced extends LuceneTestCase { super.setUp(); // create test index mDirectory = newDirectory(); - final RandomIndexWriter writer = new RandomIndexWriter(random, - mDirectory, newIndexWriterConfig(TEST_VERSION_CURRENT, - new MockAnalyzer(random, MockTokenizer.SIMPLE, true, - 
MockTokenFilter.ENGLISH_STOPSET, true)).setMergePolicy(newLogMergePolicy())); + final RandomIndexWriter writer = new RandomIndexWriter(random, mDirectory, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)) + .setMergePolicy(newLogMergePolicy()).setSimilarityProvider(new DefaultSimilarityProvider())); addDocument(writer, "1", "I think it should work."); addDocument(writer, "2", "I think it should work."); addDocument(writer, "3", "I think it should work."); @@ -68,6 +69,7 @@ public class TestSpansAdvanced extends LuceneTestCase { reader = writer.getReader(); writer.close(); searcher = newSearcher(reader); + searcher.setSimilarityProvider(new DefaultSimilarityProvider()); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java b/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java index 288b062976e..00828201e15 100644 --- a/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java +++ b/lucene/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; /******************************************************************************* * Some expanded tests to make sure my patch doesn't break other SpanTermQuery @@ -48,7 +49,8 @@ public class TestSpansAdvanced2 extends TestSpansAdvanced { final RandomIndexWriter writer = new RandomIndexWriter(random, mDirectory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)) - .setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())); + .setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy()) + 
.setSimilarityProvider(new DefaultSimilarityProvider())); addDocument(writer, "A", "Should we, could we, would we?"); addDocument(writer, "B", "It should. Should it?"); addDocument(writer, "C", "It shouldn't."); @@ -58,6 +60,7 @@ public class TestSpansAdvanced2 extends TestSpansAdvanced { // re-open the searcher since we added more docs searcher2 = newSearcher(reader2); + searcher2.setSimilarityProvider(new DefaultSimilarityProvider()); } @Override diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java index b6a53416f37..259ddaa883c 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java @@ -21,8 +21,8 @@ import org.apache.lucene.index.*; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.queries.function.DocValues; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; import java.io.IOException; diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java index f2b5436bb6f..81fa80eadff 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java @@ -22,8 +22,8 @@ import org.apache.lucene.queries.function.DocValues; import org.apache.lucene.queries.function.ValueSource; import 
org.apache.lucene.queries.function.docvalues.FloatDocValues; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import java.io.IOException; import java.util.Map; diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java index 90b605bc25c..42bfb1cac90 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java @@ -23,8 +23,8 @@ import org.apache.lucene.queries.function.DocValues; import org.apache.lucene.queries.function.docvalues.FloatDocValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; import java.io.IOException; diff --git a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index 9a777090f66..5c776585447 100644 --- a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -24,6 +24,8 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import 
org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; diff --git a/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/StandardBooleanQueryNodeBuilder.java b/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/StandardBooleanQueryNodeBuilder.java index 04616bed4bc..5a4b09ebe43 100644 --- a/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/StandardBooleanQueryNodeBuilder.java +++ b/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/StandardBooleanQueryNodeBuilder.java @@ -31,8 +31,8 @@ import org.apache.lucene.queryparser.flexible.standard.parser.EscapeQuerySyntaxI import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.BooleanQuery.TooManyClauses; +import org.apache.lucene.search.similarities.SimilarityProvider; /** * This builder does the same as the {@link BooleanQueryNodeBuilder}, but this diff --git a/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/StandardBooleanQueryNode.java b/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/StandardBooleanQueryNode.java index b72dd045198..aceb2810ba5 100644 --- a/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/StandardBooleanQueryNode.java +++ b/modules/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/StandardBooleanQueryNode.java @@ -22,7 +22,7 @@ import java.util.List; import org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.SimilarityProvider; +import 
org.apache.lucene.search.similarities.SimilarityProvider; /** * A {@link StandardBooleanQueryNode} has the same behavior as diff --git a/solr/client/ruby/solr-ruby/solr/conf/schema.xml b/solr/client/ruby/solr-ruby/solr/conf/schema.xml index bbd360ad4d8..daac616b121 100755 --- a/solr/client/ruby/solr-ruby/solr/conf/schema.xml +++ b/solr/client/ruby/solr-ruby/solr/conf/schema.xml @@ -217,6 +217,6 @@ - + diff --git a/solr/client/ruby/solr-ruby/test/conf/schema.xml b/solr/client/ruby/solr-ruby/test/conf/schema.xml index fafe6c18e5e..a284b29ddce 100755 --- a/solr/client/ruby/solr-ruby/test/conf/schema.xml +++ b/solr/client/ruby/solr-ruby/test/conf/schema.xml @@ -235,6 +235,6 @@ - + diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 681c4ba8472..0babcf289d9 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -27,10 +27,10 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.Query; -import org.apache.lucene.search.Similarity; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; diff --git a/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java b/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java index eaf32da412f..b763aa8d134 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java @@ -18,7 +18,7 @@ package org.apache.solr.schema; import org.apache.lucene.analysis.Analyzer; -import 
org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.Version; import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.SolrException; diff --git a/solr/core/src/java/org/apache/solr/schema/IndexSchema.java b/solr/core/src/java/org/apache/solr/schema/IndexSchema.java index 90c27564e04..4901c637da8 100644 --- a/solr/core/src/java/org/apache/solr/schema/IndexSchema.java +++ b/solr/core/src/java/org/apache/solr/schema/IndexSchema.java @@ -20,9 +20,9 @@ package org.apache.solr.schema; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.IndexableField; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.util.Version; import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.SolrException; diff --git a/solr/core/src/java/org/apache/solr/schema/SimilarityFactory.java b/solr/core/src/java/org/apache/solr/schema/SimilarityFactory.java index 25b03a0ad68..e7a73dcf84d 100644 --- a/solr/core/src/java/org/apache/solr/schema/SimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/schema/SimilarityFactory.java @@ -16,7 +16,7 @@ package org.apache.solr.schema; * limitations under the License. 
*/ -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.solr.common.params.SolrParams; public abstract class SimilarityFactory { diff --git a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java index ca71cebe263..14ca0a106d6 100644 --- a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java @@ -19,6 +19,7 @@ package org.apache.solr.search; import org.apache.lucene.index.*; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.OpenBitSet; diff --git a/solr/core/src/java/org/apache/solr/search/SolrSimilarityProvider.java b/solr/core/src/java/org/apache/solr/search/SolrSimilarityProvider.java index 2fe437fd999..c68f6bddb4b 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrSimilarityProvider.java +++ b/solr/core/src/java/org/apache/solr/search/SolrSimilarityProvider.java @@ -17,8 +17,8 @@ package org.apache.solr.search; * limitations under the License. 
*/ -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; diff --git a/solr/core/src/test-files/solr/conf/schema-copyfield-test.xml b/solr/core/src/test-files/solr/conf/schema-copyfield-test.xml index d294af661c2..9e4716f5473 100644 --- a/solr/core/src/test-files/solr/conf/schema-copyfield-test.xml +++ b/solr/core/src/test-files/solr/conf/schema-copyfield-test.xml @@ -468,6 +468,6 @@ A custom similarity may be specified here, but the default is fine for most applications. --> - + diff --git a/solr/core/src/test-files/solr/conf/schema-required-fields.xml b/solr/core/src/test-files/solr/conf/schema-required-fields.xml index f17948476f7..c5dec556a7f 100644 --- a/solr/core/src/test-files/solr/conf/schema-required-fields.xml +++ b/solr/core/src/test-files/solr/conf/schema-required-fields.xml @@ -434,6 +434,6 @@ A custom similarity may be specified here, but the default is fine for most applications. 
--> - + diff --git a/solr/core/src/test/org/apache/solr/schema/CustomSimilarityFactory.java b/solr/core/src/test/org/apache/solr/schema/CustomSimilarityFactory.java index 98267cce285..782bf6b1705 100644 --- a/solr/core/src/test/org/apache/solr/schema/CustomSimilarityFactory.java +++ b/solr/core/src/test/org/apache/solr/schema/CustomSimilarityFactory.java @@ -16,7 +16,7 @@ */ package org.apache.solr.schema; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.Similarity; public class CustomSimilarityFactory extends SimilarityFactory { @Override diff --git a/solr/core/src/test/org/apache/solr/schema/IndexSchemaTest.java b/solr/core/src/test/org/apache/solr/schema/IndexSchemaTest.java index 737cb90ffc1..5a0a9f04e4d 100644 --- a/solr/core/src/test/org/apache/solr/schema/IndexSchemaTest.java +++ b/solr/core/src/test/org/apache/solr/schema/IndexSchemaTest.java @@ -27,7 +27,7 @@ import org.apache.solr.common.params.MapSolrParams; import org.apache.solr.core.SolrCore; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.junit.BeforeClass; import org.junit.Test; diff --git a/solr/core/src/test/org/apache/solr/schema/MockConfigurableSimilarity.java b/solr/core/src/test/org/apache/solr/schema/MockConfigurableSimilarity.java index 36021fab4e7..74394c4377d 100644 --- a/solr/core/src/test/org/apache/solr/schema/MockConfigurableSimilarity.java +++ b/solr/core/src/test/org/apache/solr/schema/MockConfigurableSimilarity.java @@ -16,7 +16,7 @@ */ package org.apache.solr.schema; -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; public class MockConfigurableSimilarity extends DefaultSimilarity { private String passthrough; diff --git a/solr/core/src/test/org/apache/solr/schema/TestPerFieldSimilarity.java 
b/solr/core/src/test/org/apache/solr/schema/TestPerFieldSimilarity.java index 3aa81c30c84..1ab50255a30 100644 --- a/solr/core/src/test/org/apache/solr/schema/TestPerFieldSimilarity.java +++ b/solr/core/src/test/org/apache/solr/schema/TestPerFieldSimilarity.java @@ -18,8 +18,8 @@ package org.apache.solr.schema; */ import org.apache.lucene.misc.SweetSpotSimilarity; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.core.SolrCore; import org.apache.solr.search.SolrIndexSearcher; diff --git a/solr/core/src/test/org/apache/solr/search/function/TestFunctionQuery.java b/solr/core/src/test/org/apache/solr/search/function/TestFunctionQuery.java index f15363c621d..774cbf703d6 100755 --- a/solr/core/src/test/org/apache/solr/search/function/TestFunctionQuery.java +++ b/solr/core/src/test/org/apache/solr/search/function/TestFunctionQuery.java @@ -19,9 +19,9 @@ package org.apache.solr.search.function; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.codecs.CodecProvider; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.FieldCache; -import org.apache.lucene.search.TFIDFSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; diff --git a/solr/example/example-DIH/solr/db/conf/schema.xml b/solr/example/example-DIH/solr/db/conf/schema.xml index 24718d300a2..f110064d953 100644 --- a/solr/example/example-DIH/solr/db/conf/schema.xml +++ b/solr/example/example-DIH/solr/db/conf/schema.xml @@ -354,6 +354,6 @@ - + diff --git a/solr/example/solr/conf/schema.xml 
b/solr/example/solr/conf/schema.xml index dc5ae9a90c8..b17500aea7a 100755 --- a/solr/example/solr/conf/schema.xml +++ b/solr/example/solr/conf/schema.xml @@ -615,7 +615,7 @@ - +