diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2d577d09af9..abd9e84054c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -495,6 +495,34 @@ New features * LUCENE-3423: add Terms.getDocCount(), which returns the number of documents that have at least one term for a field. (Yonik Seeley, Robert Muir) +* LUCENE-2959: Added a variety of different relevance ranking systems to Lucene. + + - Added Okapi BM25, Language Models, Divergence from Randomness, and + Information-Based Models. The models are pluggable, support all of lucene's + features (boosts, slops, explanations, etc) and queries (spans, etc). + + - All models default to the same index-time norm encoding as DefaultSimilarity: + so you can easily try these out/switch back and forth/run experiments and + comparisons without reindexing. Note: most of the models do rely upon index + statistics that are new in Lucene 4.0, so for existing 3.x indexes its a good + idea to upgrade your index to the new format with IndexUpgrader first. + + - Added a new subclass SimilarityBase which provides a simplified API + for plugging in new ranking algorithms without dealing with all of the + nuances and implementation details of Lucene. + + - Added a new helper class BasicSimilarityProvider that just applies one + scoring algorithm to all fields, with queryNorm() and coord() returning 1. + In general, it is recommended to disable coord() when using the new models. + For example, to use BM25 for all fields: + searcher.setSimilarityProvider(new BasicSimilarityProvider(new BM25Similarity())); + + If you instead want to apply different similarities (e.g. ones with different + parameter values or different algorithms entirely) to different fields, implement + SimilarityProvider with your per-field logic. 
+ + (David Mark Nemeskey via Robert Muir) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index 01512b149a4..d989eaf264f 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -43,7 +43,7 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.AttributeImpl; diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 7d6e050b60c..09275a5c8bc 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -57,8 +57,8 @@ import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.RAMDirectory; // for javadocs import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; diff --git 
a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java index 4322f0ad333..133a29c7710 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java @@ -22,9 +22,9 @@ import java.util.Date; import java.util.List; import java.util.ArrayList; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index f1ac1459532..f3321eff818 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -17,7 +17,7 @@ package org.apache.lucene.misc; -import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.index.FieldInvertState; /** diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java index ac28283f061..080e2fecd13 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java @@ -26,13 +26,13 @@ import org.apache.lucene.document.StringField; import 
org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java index 0e9732c4a91..d1457b0b3e4 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java @@ -18,11 +18,11 @@ package org.apache.lucene.misc; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; -import org.apache.lucene.search.SimilarityProvider; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.FieldInvertState; diff --git 
a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java index 6a18043cddc..da4e704e905 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestLengthNormModifier.java @@ -31,13 +31,13 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MultiNorms; import org.apache.lucene.index.Term; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarityProvider; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index bef95219193..04942968f68 100644 --- a/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/contrib/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -31,6 +31,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.TFIDFSimilarity; +import org.apache.lucene.search.similarities.DefaultSimilarity; import 
org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; diff --git a/lucene/src/java/org/apache/lucene/document/Field.java b/lucene/src/java/org/apache/lucene/document/Field.java index f82c9adc3c7..c24cba94c67 100644 --- a/lucene/src/java/org/apache/lucene/document/Field.java +++ b/lucene/src/java/org/apache/lucene/document/Field.java @@ -223,14 +223,14 @@ public class Field implements IndexableField { * document. * *
The boost is used to compute the norm factor for the field. By
- * default, in the {@link org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method,
+ * default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)} method,
* the boost value is multiplied by the length normalization factor and then
- * rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
+ * rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
- * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
- * @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
+ * @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)
+ * @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)
*/
public void setBoost(float boost) {
this.boost = boost;
diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
index 945503a6310..a63b430e542 100644
--- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -32,7 +32,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.DocumentsWriterPerThreadPool.ThreadState;
import org.apache.lucene.index.FieldInfos.FieldNumberBiMap;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.SimilarityProvider;
+import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
index 09983bbff2a..4eec2723b05 100644
--- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
+++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
@@ -26,7 +26,7 @@ import java.text.NumberFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice;
-import org.apache.lucene.search.SimilarityProvider;
+import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java
index a25b9d82b00..e94b3afd24e 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java
@@ -32,7 +32,7 @@ import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.search.FieldCache; // javadocs
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
@@ -1012,7 +1012,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
*
* @see #norms(String)
* @see Similarity#computeNorm(FieldInvertState)
- * @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
+ * @see org.apache.lucene.search.similarities.DefaultSimilarity#decodeNormValue(byte)
* @throws StaleReaderException if the index has changed
* since this reader was opened
* @throws CorruptIndexException if the index is corrupt
diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 700dbb07145..326a1d8da5e 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -22,7 +22,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.SimilarityProvider;
+import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.util.Version;
/**
diff --git a/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java
index 9a0612c7139..825bc55abd7 100644
--- a/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java
+++ b/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java
@@ -17,7 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
/** Taps into DocInverter, as an InvertedDocEndConsumer,
diff --git a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
index 902ae69c3d1..5cc537f69bc 100644
--- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
@@ -24,7 +24,8 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs;
-import org.apache.lucene.search.Similarity.ExactDocScorer;
+import org.apache.lucene.search.similarities.SimilarityProvider;
+import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.search.TermQuery.TermWeight;
import java.io.IOException;
diff --git a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
index e707627b3c7..7f7d53df709 100644
--- a/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
+++ b/lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.BooleanWeight;
+import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.Scorer.ChildScorer;
/* See the description in BooleanScorer.java, comparing
diff --git a/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java b/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java
index caf21e23551..b0a464ef302 100644
--- a/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/ConjunctionTermScorer.java
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.Similarity.ExactDocScorer;
+import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
import java.util.Comparator;
diff --git a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index 08cf2c330d4..0a0454b7096 100644
--- a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.*;
+import org.apache.lucene.search.similarities.Similarity;
final class ExactPhraseScorer extends Scorer {
private final int endMinus1;
diff --git a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java
index efe7cf130e2..d7537a81196 100644
--- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -38,6 +38,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Weight.ScorerContext;
+import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
+import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory; // javadoc
import org.apache.lucene.util.ReaderUtil;
diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index 1a65b65ee31..47232cc42f9 100644
--- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -26,7 +26,8 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.search.Similarity.SloppyDocScorer;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
@@ -164,8 +165,7 @@ public class MultiPhraseQuery extends Query {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
- if (termArrays.size() == 0) // optimize zero-term case
- return null;
+ assert !termArrays.isEmpty();
final IndexReader reader = context.reader;
final Bits liveDocs = reader.getLiveDocs();
@@ -249,7 +249,11 @@ public class MultiPhraseQuery extends Query {
@Override
public Query rewrite(IndexReader reader) {
- if (termArrays.size() == 1) { // optimize one-term case
+ if (termArrays.isEmpty()) {
+ BooleanQuery bq = new BooleanQuery();
+ bq.setBoost(getBoost());
+ return bq;
+ } else if (termArrays.size() == 1) { // optimize one-term case
Term[] terms = termArrays.get(0);
BooleanQuery boq = new BooleanQuery(true);
for (int i=0; i Most basic models use the number of documents and the total term
+ * frequency to compute Inf1. This method provides a generic
+ * explanation for such models. Subclasses that use other statistics must
+ * override this method.
+ * WARNING: for terms that do not meet the expected random distribution
+ * (e.g. stopwords), this model may give poor performance, such as
+ * abnormally high scores for low tf values.
+ * @lucene.experimental
+ */
+public class BasicModelD extends BasicModel {
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    // tfn is unconditionally folded into totalTermFreq to form a 'normalized' F: this is
+    // the cleanest way to keep phi < 1 for tiny TTF values, since otherwise nphi can go
+    // negative and the result becomes NaN.
+    final double normalizedF = stats.getTotalTermFreq() + tfn;
+    final double phi = (double)tfn / normalizedF;
+    final double nphi = 1 - phi;
+    final double p = 1.0 / (stats.getNumberOfDocuments() + 1);
+    final double d = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
+    return (float)(d * normalizedF + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi));
+  }
+
+  @Override
+  public String toString() {
+    return "D";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java
new file mode 100644
index 00000000000..edd50b0f00f
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Geometric as limiting form of the Bose-Einstein model. The formula used in Lucene differs
+ * slightly from the one in the original paper: {@code N} is increased by {@code F}.
+ * @lucene.experimental
+ */
+public class BasicModelG extends BasicModel {
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
+    final long totalTermFreq = stats.getTotalTermFreq();
+    final double lambda = totalTermFreq / (double) (stats.getNumberOfDocuments() + totalTermFreq);
+    // -log(1 / (lambda + 1)) -> log(lambda + 1)
+    return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
+  }
+
+  @Override
+  public String toString() {
+    return "G";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java
new file mode 100644
index 00000000000..3cef323d11c
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java
@@ -0,0 +1,38 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Approximates the I(ne) model from the document count and the total term frequency.
+ * @lucene.experimental
+ */
+public class BasicModelIF extends BasicModel {
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    final int docCountPlusOne = stats.getNumberOfDocuments() + 1;
+    final double smoothedTotalTermFreq = stats.getTotalTermFreq() + 0.5;
+    return tfn * (float)log2(1 + docCountPlusOne / smoothedTotalTermFreq);
+  }
+
+  @Override
+  public String toString() {
+    return "I(F)";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java
new file mode 100644
index 00000000000..a61222e5075
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java
@@ -0,0 +1,52 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Basic tf-idf model of randomness, driven by the document count and the document frequency.
+ * @lucene.experimental
+ */
+public class BasicModelIn extends BasicModel {
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    final int docCountPlusOne = stats.getNumberOfDocuments() + 1;
+    final double smoothedDocFreq = stats.getDocFreq() + 0.5;
+    return tfn * (float)log2(docCountPlusOne / smoothedDocFreq);
+  }
+
+  @Override
+  public final Explanation explain(BasicStats stats, float tfn) {
+    // Root node: the score itself, described by the simple name of the runtime class.
+    final Explanation explanation = new Explanation();
+    explanation.setDescription(getClass().getSimpleName() + ", computed from: ");
+    explanation.setValue(score(stats, tfn));
+    // Detail nodes, in order: the inputs the score is computed from.
+    explanation.addDetail(new Explanation(tfn, "tfn"));
+    explanation.addDetail(new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
+    explanation.addDetail(new Explanation(stats.getDocFreq(), "docFreq"));
+    return explanation;
+  }
+
+  @Override
+  public String toString() {
+    return "I(n)";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java
new file mode 100644
index 00000000000..cdbdeb4edd1
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java
@@ -0,0 +1,40 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Tf-idf model of randomness that mixes a Poisson model with inverse
+ * document frequency.
+ * @lucene.experimental
+ */
+public class BasicModelIne extends BasicModel {
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    final int numDocs = stats.getNumberOfDocuments();
+    final double base = (numDocs - 1) / (double)numDocs;
+    final double ne = numDocs * (1 - Math.pow(base, stats.getTotalTermFreq()));
+    return tfn * (float)log2((numDocs + 1) / (ne + 0.5));
+  }
+
+  @Override
+  public String toString() {
+    return "I(ne)";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java
new file mode 100644
index 00000000000..41a88232ec2
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java
@@ -0,0 +1,46 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Implements the Poisson approximation for the binomial model for DFR.
+ * <p>
+ * WARNING: for terms that do not meet the expected random distribution (e.g. stopwords),
+ * this model may give poor performance, such as abnormally high scores for low tf values.
+ * @lucene.experimental
+ */
+public class BasicModelP extends BasicModel {
+  /** {@code log2(Math.E)}, precomputed. */
+  protected static final double LOG2_E = log2(Math.E);
+
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    // lambda: total term frequency divided by the number of documents, in float precision
+    float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
+    return (float)(tfn * log2(tfn / lambda)
+        + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
+        + 0.5 * log2(2 * Math.PI * tfn));
+  }
+
+  @Override
+  public String toString() {
+    return "P";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
new file mode 100644
index 00000000000..ac2e191d104
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
@@ -0,0 +1,54 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link SimilarityProvider} that hands out one and the same
+ * {@link Similarity}, the instance given to the constructor, whatever field
+ * is asked for. It targets non-VSM models, which is why {@link #coord} and
+ * {@link #queryNorm} are both fixed at {@code 1}. When working with
+ * {@link DefaultSimilarity}, use {@link DefaultSimilarityProvider} instead.
+ * @lucene.experimental
+ */
+public class BasicSimilarityProvider implements SimilarityProvider {
+  private final Similarity similarity; // the one instance returned for every field
+  /** Wraps {@code sim}; it is stored as given and returned by {@link #get}. */
+  public BasicSimilarityProvider(Similarity sim) {
+    this.similarity = sim;
+  }
+  /** Returns the wrapped similarity; {@code field} is not consulted. */
+  @Override
+  public Similarity get(String field) {
+    return similarity;
+  }
+  /** Always {@code 1}: the coordination factor is switched off. */
+  @Override
+  public float coord(int overlap, int maxOverlap) {
+    return 1.0f;
+  }
+  /** Always {@code 1}: no query normalization is applied. */
+  @Override
+  public float queryNorm(float sumOfSquaredWeights) {
+    return 1.0f;
+  }
+  /** Returns this class's name followed by the wrapped similarity in parentheses. */
+  @Override
+  public String toString() {
+    return new StringBuilder("BasicSimilarityProvider(").append(similarity).append(')').toString();
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java
new file mode 100644
index 00000000000..a96e7a0aa74
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java
@@ -0,0 +1,144 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Terms;
+
+/**
+ * Holds the statistics that are commonly used by ranking methods.
+ * @lucene.experimental
+ */
+public class BasicStats extends Similarity.Stats {
+  /** Number of documents. */
+  protected int numberOfDocuments;
+  /** Total number of tokens in the field. */
+  protected long numberOfFieldTokens;
+  /** Average length of the field. */
+  protected float avgFieldLength;
+  /** Document frequency. */
+  protected int docFreq;
+  /** Total occurrences of this term, counted over all documents. */
+  protected long totalTermFreq;
+
+  // ----------------------------- Boost fields ------------------------------
+
+  /** The inner boost of the query. */
+  protected final float queryBoost;
+  /** The boost of any outer query. */
+  protected float topLevelBoost;
+  /** Combined boost: {@code queryBoost} at first, then the product of the two
+   * boosts once {@link #normalize(float, float)} has run. Kept because most
+   * Similarities do not treat the immediate and top-level boosts differently. */
+  protected float totalBoost;
+
+  /** Creates the stats; the total boost starts out equal to {@code queryBoost}. */
+  public BasicStats(float queryBoost) {
+    this.totalBoost = queryBoost;
+    this.queryBoost = queryBoost;
+  }
+
+  // ------------------------------- Accessors -------------------------------
+
+  /** Gets the number of documents. */
+  public int getNumberOfDocuments() {
+    return this.numberOfDocuments;
+  }
+
+  /** Stores the number of documents. */
+  public void setNumberOfDocuments(int numberOfDocuments) {
+    this.numberOfDocuments = numberOfDocuments;
+  }
+
+  /**
+   * Gets the total number of tokens in the field.
+   * @see Terms#getSumTotalTermFreq()
+   */
+  public long getNumberOfFieldTokens() {
+    return this.numberOfFieldTokens;
+  }
+
+  /**
+   * Stores the total number of tokens in the field.
+   * @see Terms#getSumTotalTermFreq()
+   */
+  public void setNumberOfFieldTokens(long numberOfFieldTokens) {
+    this.numberOfFieldTokens = numberOfFieldTokens;
+  }
+
+  /** Gets the average field length. */
+  public float getAvgFieldLength() {
+    return this.avgFieldLength;
+  }
+
+  /** Stores the average field length. */
+  public void setAvgFieldLength(float avgFieldLength) {
+    this.avgFieldLength = avgFieldLength;
+  }
+
+  /** Gets the document frequency. */
+  public int getDocFreq() {
+    return this.docFreq;
+  }
+
+  /** Stores the document frequency. */
+  public void setDocFreq(int docFreq) {
+    this.docFreq = docFreq;
+  }
+
+  /** Gets the total number of occurrences of this term across all documents. */
+  public long getTotalTermFreq() {
+    return this.totalTermFreq;
+  }
+
+  /** Stores the total number of occurrences of this term across all documents. */
+  public void setTotalTermFreq(long totalTermFreq) {
+    this.totalTermFreq = totalTermFreq;
+  }
+
+  // ----------------------------- Boost methods -----------------------------
+
+  /** Returns the raw normalization value multiplied by itself.
+   * @see #rawNormalizationValue() */
+  @Override
+  public float getValueForNormalization() {
+    final float raw = rawNormalizationValue(); // evaluated exactly once
+    return raw * raw;
+  }
+
+  /** Supplies the raw normalization value, which in this base class is just
+   * the query boost. A subclass may override it to fold in further factors
+   * (idf, for instance), or to remember the value so that it can be used in
+   * {@link #normalize(float, float)}, etc.
+   */
+  protected float rawNormalizationValue() {
+    return this.queryBoost;
+  }
+
+  /** Performs no normalization: {@code queryNorm} is ignored, while
+   * {@code topLevelBoost} is recorded and the total boost recomputed. */
+  @Override
+  public void normalize(float queryNorm, float topLevelBoost) {
+    this.topLevelBoost = topLevelBoost;
+    this.totalBoost = this.queryBoost * topLevelBoost;
+  }
+
+  /** Gets the total boost. */
+  public float getTotalBoost() {
+    return this.totalBoost;
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
new file mode 100644
index 00000000000..6e3039687cc
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
@@ -0,0 +1,86 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Implements the divergence from randomness (DFR) framework
+ * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002.
+ * Probabilistic models of information retrieval based on measuring the
+ * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002),
+ * 357-389.
+ * The DFR scoring formula is composed of three separate components: the
+ * basic model, the aftereffect and an additional
+ * normalization component, represented by the classes
+ * {@code BasicModel}, {@code AfterEffect} and {@code Normalization},
+ * respectively. The names of these classes were chosen to match the names of
+ * their counterparts in the Terrier IR engine. Note that qtf, the multiplicity of term-occurrence in the query,
+ * is not handled by this implementation. Unlike for DFR, the natural logarithm is used, as
+ * it is faster to compute and the original paper does not express any
+ * preference to a specific base. Unlike for DFR, the natural logarithm is used, as
+ * it is faster to compute and the original paper does not express any
+ * preference to a specific base. The retrieval function is of the form RSV(q, d) = ∑
+ * -xqw log Prob(Xw ≥
+ * tdw | λw), where
+ * Scorer
for documents matching a Term
.
*/
diff --git a/lucene/src/java/org/apache/lucene/search/Weight.java b/lucene/src/java/org/apache/lucene/search/Weight.java
index e99c5a6b5cb..94607cb212e 100644
--- a/lucene/src/java/org/apache/lucene/search/Weight.java
+++ b/lucene/src/java/org/apache/lucene/search/Weight.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
+import org.apache.lucene.search.similarities.SimilarityProvider;
/**
* Expert: Calculate query weights and build query scorers.
diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java
index c5f8900a4ff..a90e0d955f5 100644
--- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java
@@ -22,10 +22,10 @@ import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.Weight;
-import org.apache.lucene.search.Similarity.SloppyDocScorer;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery;
@@ -52,7 +52,7 @@ import java.util.Iterator;
*
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
*
- * @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
+ * @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
*/
public class PayloadNearQuery extends SpanNearQuery {
protected String fieldName;
diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java
index 0df4220a4a8..bd8e23978bc 100644
--- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java
@@ -20,16 +20,16 @@ package org.apache.lucene.search.payloads;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
-import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
@@ -49,7 +49,7 @@ import java.io.IOException;
* which returns 1 by default.
*
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
- * @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
+ * @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
**/
public class PayloadTermQuery extends SpanTermQuery {
protected PayloadFunction function;
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java
new file mode 100644
index 00000000000..4610f2bd7b3
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffect.java
@@ -0,0 +1,63 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Base class for the implementations of the first normalization of the
+ * informative content in the DFR framework. The component is also known
+ * as the <em>after effect</em>; it is defined by the formula
+ * {@code Inf2 = 1 - Prob2}, in which {@code Prob2} measures the
+ * information gain.
+ *
+ * @see DFRSimilarity
+ * @lucene.experimental
+ */
+public abstract class AfterEffect {
+  /** Computes the aftereffect score. */
+  public abstract float score(BasicStats stats, float tfn);
+
+  /** Builds an explanation for the score. */
+  public abstract Explanation explain(BasicStats stats, float tfn);
+
+  /**
+   * Must be overridden by subclasses to return the code of the after
+   * effect formula; the original paper has the list of codes.
+   */
+  @Override
+  public abstract String toString();
+
+  /** The after effect to use when none is wanted: its score is always 1. */
+  public static final class NoAfterEffect extends AfterEffect {
+    @Override
+    public final float score(BasicStats stats, float tfn) {
+      return 1.0f;
+    }
+
+    @Override
+    public final Explanation explain(BasicStats stats, float tfn) {
+      return new Explanation(1.0f, "no aftereffect");
+    }
+
+    @Override
+    public String toString() {
+      return "";
+    }
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
new file mode 100644
index 00000000000..b1f4320043c
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Information gain modelled as the ratio of two Bernoulli processes.
+ * @lucene.experimental
+ */
+public class AfterEffectB extends AfterEffect {
+  /** Returns {@code (totalTermFreq + 1) / (docFreq * (tfn + 1))}. */
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    long totalTermFreq = stats.getTotalTermFreq();
+    int docFreq = stats.getDocFreq();
+    return (totalTermFreq + 1) / (docFreq * (tfn + 1));
+  }
+  /** Explains the score, listing tfn, totalTermFreq and docFreq as details. */
+  @Override
+  public final Explanation explain(BasicStats stats, float tfn) {
+    Explanation expl = new Explanation(
+        score(stats, tfn), getClass().getSimpleName() + ", computed from: ");
+    expl.addDetail(new Explanation(tfn, "tfn"));
+    expl.addDetail(new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
+    expl.addDetail(new Explanation(stats.getDocFreq(), "docFreq"));
+    return expl;
+  }
+  /** Returns the code of this after effect, {@code "B"}. */
+  @Override
+  public String toString() {
+    return "B";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java
new file mode 100644
index 00000000000..54798309744
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectL.java
@@ -0,0 +1,45 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Information gain modelled on Laplace's law of succession.
+ * @lucene.experimental
+ */
+public class AfterEffectL extends AfterEffect {
+  /** Returns {@code 1 / (tfn + 1)}; {@code stats} is not used. */
+  @Override
+  public final float score(BasicStats stats, float tfn) {
+    return 1.0f / (tfn + 1.0f);
+  }
+  /** Explains the score, with tfn as the only detail. */
+  @Override
+  public final Explanation explain(BasicStats stats, float tfn) {
+    Explanation expl = new Explanation(
+        score(stats, tfn), getClass().getSimpleName() + ", computed from: ");
+    expl.addDetail(new Explanation(tfn, "tfn"));
+    return expl;
+  }
+  /** Returns the code of this after effect, {@code "L"}. */
+  @Override
+  public String toString() {
+    return "L";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
new file mode 100644
index 00000000000..c9c542af10c
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -0,0 +1,339 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
+import org.apache.lucene.util.TermContext;
+
+/**
+ * BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
+ * Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3.
+ * In Proceedings of the Third Text REtrieval Conference (TREC 1994).
+ * Gaithersburg, USA, November 1994.
+ * @lucene.experimental
+ */
+public class BM25Similarity extends Similarity {
+ private final float k1; // the k1 parameter of the scoring formula
+ private final float b; // the b parameter: weight of the length-dependent part (see computeStats)
+ // TODO: should we add a delta like sifaka.cs.uiuc.edu/~ylv2/pub/sigir11-bm25l.pdf ?
+ /** Creates a BM25 similarity with the given parameters; the values are stored as given, without validation. */
+ public BM25Similarity(float k1, float b) {
+ this.k1 = k1;
+ this.b = b;
+ }
+
+ /** BM25 with these default values:
+ * <ul>
+ * <li>{@code k1 = 1.2},
+ * <li>{@code b = 0.75}.</li>
+ * </ul>
+ */
+ public BM25Similarity() {
+ this.k1 = 1.2f;
+ this.b = 0.75f;
+ }
+
+ /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
+ protected float idf(int docFreq, int numDocs) {
+ return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+ }
+
+ /** Implemented as <code>1 / (distance + 1)</code>. */
+ protected float sloppyFreq(int distance) {
+ return 1.0f / (distance + 1);
+ }
+
+ /** The default implementation returns <code>1</code> */
+ protected float scorePayload(int doc, int start, int end, BytesRef payload) {
+ return 1;
+ }
+
+ /** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
+ * or returns <code>1</code> if the index does not store sumTotalTermFreq (Lucene 3.x indexes
+ * or any field that omits frequency information) or if the field does not exist. */
+ protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
+ Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field);
+ if (terms == null) {
+ // field does not exist;
+ return 1f;
+ }
+ long sumTotalTermFreq = terms.getSumTotalTermFreq();
+ long maxdoc = searcher.maxDoc();
+ return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc); // -1 is treated as "statistic not stored"
+ }
+
+ /** The default implementation encodes <code>boost / sqrt(length)</code>
+ * with {@link SmallFloat#floatToByte315(float)}. This is compatible with
+ * Lucene's default implementation. If you change this, then you should
+ * change {@link #decodeNormValue(byte)} to match. */
+ protected byte encodeNormValue(float boost, int fieldLength) {
+ return SmallFloat.floatToByte315(boost / (float) Math.sqrt(fieldLength));
+ }
+
+ /** The default implementation returns <code>1 / f<sup>2</sup></code>
+ * where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
+ protected float decodeNormValue(byte b) {
+ return NORM_TABLE[b & 0xFF]; // & 0xFF: use the byte as an unsigned table index
+ }
+
+ /** Whether overlap tokens are left out of the length used for the norm; true by default. */
+ protected boolean discountOverlaps = true;
+
+ /** Determines whether overlap tokens (Tokens with 0 position increment) are
+ * ignored when computing norm. By default this is true, meaning overlap
+ * tokens do not count when computing norms. */
+ public void setDiscountOverlaps(boolean v) {
+ discountOverlaps = v;
+ }
+
+ /** @see #setDiscountOverlaps */
+ public boolean getDiscountOverlaps() {
+ return discountOverlaps;
+ }
+
+ /** Cache of decoded bytes: entry i holds 1 / (f*f) for f = byte315ToFloat((byte) i). */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++) {
+ float f = SmallFloat.byte315ToFloat((byte)i);
+ NORM_TABLE[i] = 1.0f / (f*f);
+ }
+ }
+ /** Encodes the field's boost and length (overlap tokens subtracted when {@code discountOverlaps} is set) as the norm byte. */
+ @Override
+ public final byte computeNorm(FieldInvertState state) {
+ final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
+ return encodeNormValue(state.getBoost(), numTerms);
+ }
+ /** Returns the idf of one term, computed from its docFreq and the searcher's maxDoc, wrapped in an explanation. */
+ public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
+ final int df = stats.docFreq();
+ final int max = searcher.maxDoc();
+ final float idf = idf(df, max);
+ return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+ }
+ /** Returns the sum of the idfs of several terms as an explanation, with one detail per term. */
+ public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
+ final int max = searcher.maxDoc();
+ float idf = 0.0f;
+ final Explanation exp = new Explanation();
+ exp.setDescription("idf(), sum of:");
+ for (final TermContext stat : stats ) {
+ final int df = stat.docFreq();
+ final float termIdf = idf(df, max);
+ exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
+ idf += termIdf;
+ }
+ exp.setValue(idf);
+ return exp;
+ }
+ /** Builds the query-time stats: the idf explanation, the average field length and a 256-entry table with one value per possible norm byte. */
+ @Override
+ public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
+ Explanation idf = termStats.length == 1 ? idfExplain(termStats[0], searcher) : idfExplain(termStats, searcher);
+
+ float avgdl = avgFieldLength(searcher, fieldName);
+
+ // compute freq-independent part of bm25 equation across all norm values
+ float cache[] = new float[256];
+ for (int i = 0; i < cache.length; i++) {
+ cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
+ }
+ return new BM25Stats(idf, queryBoost, avgdl, cache);
+ }
+ /** Returns the scorer for integer frequencies; the no-norms variant is used when the reader has no norms for the field. */
+ @Override
+ public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+ final byte[] norms = context.reader.norms(fieldName);
+ return norms == null
+ ? new ExactBM25DocScorerNoNorms((BM25Stats)stats)
+ : new ExactBM25DocScorer((BM25Stats)stats, norms);
+ }
+ /** Returns the scorer for float (sloppy) frequencies; the norms passed on to it may be null. */
+ @Override
+ public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+ return new SloppyBM25DocScorer((BM25Stats) stats, context.reader.norms(fieldName));
+ }
+ /** Scorer for integer frequencies that looks up each document's norm byte in the precomputed table. */
+ private class ExactBM25DocScorer extends ExactDocScorer {
+ private final BM25Stats stats;
+ private final float weightValue;
+ private final byte[] norms;
+ private final float[] cache;
+
+ ExactBM25DocScorer(BM25Stats stats, byte norms[]) {
+ assert norms != null;
+ this.stats = stats;
+ this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1)
+ this.cache = stats.cache;
+ this.norms = norms;
+ }
+
+ @Override
+ public float score(int doc, int freq) {
+ return weightValue * freq / (freq + cache[norms[doc] & 0xFF]); // & 0xFF: norm byte as unsigned table index
+ }
+
+ @Override
+ public Explanation explain(int doc, Explanation freq) {
+ return explainScore(doc, freq, stats, norms);
+ }
+ }
+
+ /** Scorer for integer frequencies when there are no norms: we act as if b=0, so the denominator is freq + k1. */
+ private class ExactBM25DocScorerNoNorms extends ExactDocScorer {
+ private final BM25Stats stats;
+ private final float weightValue;
+ private static final int SCORE_CACHE_SIZE = 32; // scores for freq 0..31 are precomputed in the constructor
+ private float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+ ExactBM25DocScorerNoNorms(BM25Stats stats) {
+ this.stats = stats;
+ this.weightValue = stats.weight * (k1 + 1); // boost * idf * (k1 + 1)
+ for (int i = 0; i < SCORE_CACHE_SIZE; i++)
+ scoreCache[i] = weightValue * i / (i + k1);
+ }
+
+ @Override
+ public float score(int doc, int freq) {
+ // TODO: maybe score cache is more trouble than its worth?
+ return freq < SCORE_CACHE_SIZE // check cache
+ ? scoreCache[freq] // cache hit
+ : weightValue * freq / (freq + k1); // cache miss
+ }
+
+ @Override
+ public Explanation explain(int doc, Explanation freq) {
+ return explainScore(doc, freq, stats, null);
+ }
+ }
+ /** Scorer for float frequencies; the slop and payload factors are delegated to the enclosing similarity. */
+ private class SloppyBM25DocScorer extends SloppyDocScorer {
+ private final BM25Stats stats;
+ private final float weightValue; // boost * idf * (k1 + 1)
+ private final byte[] norms;
+ private final float[] cache;
+
+ SloppyBM25DocScorer(BM25Stats stats, byte norms[]) {
+ this.stats = stats;
+ this.weightValue = stats.weight * (k1 + 1);
+ this.cache = stats.cache;
+ this.norms = norms;
+ }
+
+ @Override
+ public float score(int doc, float freq) {
+ // if there are no norms, we act as if b=0
+ float norm = norms == null ? k1 : cache[norms[doc] & 0xFF];
+ return weightValue * freq / (freq + norm);
+ }
+
+ @Override
+ public Explanation explain(int doc, Explanation freq) {
+ return explainScore(doc, freq, stats, norms);
+ }
+
+ @Override
+ public float computeSlopFactor(int distance) {
+ return sloppyFreq(distance);
+ }
+
+ @Override
+ public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+ return scorePayload(doc, start, end, payload);
+ }
+ }
+
+ /** Collection statistics for the BM25 model. */
+ private static class BM25Stats extends Stats {
+ /** BM25's idf */
+ private final Explanation idf;
+ /** The average document length. */
+ private final float avgdl;
+ /** query's inner boost */
+ private final float queryBoost;
+ /** weight (idf * boost); assigned in {@link #normalize(float, float)} */
+ private float weight;
+ /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
+ private final float cache[];
+
+ BM25Stats(Explanation idf, float queryBoost, float avgdl, float cache[]) {
+ this.idf = idf;
+ this.queryBoost = queryBoost;
+ this.avgdl = avgdl;
+ this.cache = cache;
+ }
+
+ @Override
+ public float getValueForNormalization() {
+ // we return a TF-IDF like normalization to be nice, but we don't actually normalize ourselves.
+ final float queryWeight = idf.getValue() * queryBoost;
+ return queryWeight * queryWeight;
+ }
+
+ @Override
+ public void normalize(float queryNorm, float topLevelBoost) {
+ // we don't normalize with queryNorm at all, we just capture the top-level boost
+ this.weight = idf.getValue() * queryBoost * topLevelBoost;
+ }
+ }
+ /** Builds the score explanation from boost, idf and tfNorm. NOTE(review): only queryBoost is used here, not the top-level boost held in stats.weight. */
+ private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, byte[] norms) {
+ Explanation result = new Explanation();
+ result.setDescription("score(doc="+doc+",freq="+freq+"), product of:");
+
+ Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
+ if (stats.queryBoost != 1.0f) // the boost detail is only listed when it is not 1
+ result.addDetail(boostExpl);
+
+ result.addDetail(stats.idf);
+
+ Explanation tfNormExpl = new Explanation();
+ tfNormExpl.setDescription("tfNorm, computed from:");
+ tfNormExpl.addDetail(freq);
+ tfNormExpl.addDetail(new Explanation(k1, "parameter k1"));
+ if (norms == null) {
+ tfNormExpl.addDetail(new Explanation(0, "parameter b (norms omitted for field)"));
+ tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1));
+ } else {
+ float doclen = decodeNormValue(norms[doc]);
+ tfNormExpl.addDetail(new Explanation(b, "parameter b"));
+ tfNormExpl.addDetail(new Explanation(stats.avgdl, "avgFieldLength"));
+ tfNormExpl.addDetail(new Explanation(doclen, "fieldLength"));
+ tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)));
+ }
+ result.addDetail(tfNormExpl);
+ result.setValue(boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue());
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "BM25(k1=" + k1 + ",b=" + b + ")";
+ }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java
new file mode 100644
index 00000000000..ff4d12d2099
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModel.java
@@ -0,0 +1,60 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * This class acts as the base class for the specific basic model
+ * implementations in the DFR framework. Basic models compute the
+ * <em>informative content</em> Inf<sub>1</sub> = -log<sub>2</sub>Prob<sub>1</sub>.
+ *
+ *
+ * @see DFRSimilarity
+ * @lucene.experimental
+ */
+public abstract class BasicModel {
+ /** Returns the informative content score. */
+ public abstract float score(BasicStats stats, float tfn);
+
+ /**
+ * Returns an explanation for the score.
+ *
+ *
+ *
The framework described in the paper has many similarities to the DFR + * framework (see {@link DFRSimilarity}). It is possible that the two + * Similarities will be merged at one point.
+ * @lucene.experimental + */ +public class IBSimilarity extends SimilarityBase { + /** The probabilistic distribution used to model term occurrence. */ + protected final Distribution distribution; + /** The lambda (λw) parameter. */ + protected final Lambda lambda; + /** The term frequency normalization. */ + protected final Normalization normalization; + + public IBSimilarity(Distribution distribution, + Lambda lambda, + Normalization normalization) { + this.distribution = distribution; + this.lambda = lambda; + this.normalization = normalization; + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + return stats.getTotalBoost() * + distribution.score( + stats, + normalization.tfn(stats, freq, docLen), + lambda.lambda(stats)); + } + + @Override + protected void explain( + Explanation expl, BasicStats stats, int doc, float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + Explanation normExpl = normalization.explain(stats, freq, docLen); + Explanation lambdaExpl = lambda.explain(stats); + expl.addDetail(normExpl); + expl.addDetail(lambdaExpl); + expl.addDetail(distribution.explain( + stats, normExpl.getValue(), lambdaExpl.getValue())); + } + + /** + * The name of IB methods follow the pattern + * {@code IB+ * The formula as defined the paper assigns a negative score to documents that + * contain the term, but with fewer occurrences than predicted by the collection + * language model. The Lucene implementation returns {@code 0} for such + * documents. + *
+ * + * @lucene.experimental + */ +public class LMDirichletSimilarity extends LMSimilarity { + /** The μ parameter. */ + private final float mu; + + /** @param mu the μ parameter. */ + public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { + super(collectionModel); + this.mu = mu; + } + + /** @param mu the μ parameter. */ + public LMDirichletSimilarity(float mu) { + this.mu = mu; + } + + /** Instantiates the similarity with the default μ value of 2000. */ + public LMDirichletSimilarity(CollectionModel collectionModel) { + this(collectionModel, 2000); + } + + /** Instantiates the similarity with the default μ value of 2000. */ + public LMDirichletSimilarity() { + this(2000); + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + float score = stats.getTotalBoost() * (float)(Math.log(1 + freq / + (mu * ((LMStats)stats).getCollectionProbability())) + + Math.log(mu / (docLen + mu))); + return score > 0.0f ? score : 0.0f; + } + + @Override + protected void explain(Explanation expl, BasicStats stats, int doc, + float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + + expl.addDetail(new Explanation(mu, "mu")); + Explanation weightExpl = new Explanation(); + weightExpl.setValue((float)Math.log(1 + freq / + (mu * ((LMStats)stats).getCollectionProbability()))); + weightExpl.setDescription("term weight"); + expl.addDetail(weightExpl); + expl.addDetail(new Explanation( + (float)Math.log(mu / (docLen + mu)), "document norm")); + super.explain(expl, stats, doc, freq, docLen); + } + + /** Returns the μ parameter. 
*/ + public float getMu() { + return mu; + } + + @Override + public String getName() { + return String.format("Dirichlet(%f)", getMu()); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java new file mode 100644 index 00000000000..910e769d64c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -0,0 +1,77 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Language model based on the Jelinek-Mercer smoothing method. From Chengxiang + * Zhai and John Lafferty. 2001. A study of smoothing methods for language + * models applied to Ad Hoc information retrieval. In Proceedings of the 24th + * annual international ACM SIGIR conference on Research and development in + * information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342. + *The model has a single parameter, λ. According to said paper, the + * optimal value depends on both the collection and the query. 
The optimal value + * is around {@code 0.1} for title queries and {@code 0.7} for long queries.
+ * + * @lucene.experimental + */ +public class LMJelinekMercerSimilarity extends LMSimilarity { + /** The λ parameter. */ + private final float lambda; + + /** @param lambda the λ parameter. */ + public LMJelinekMercerSimilarity( + CollectionModel collectionModel, float lambda) { + super(collectionModel); + this.lambda = lambda; + } + + /** @param lambda the λ parameter. */ + public LMJelinekMercerSimilarity(float lambda) { + this.lambda = lambda; + } + + @Override + protected float score(BasicStats stats, float freq, float docLen) { + return stats.getTotalBoost() * + (float)Math.log(1 + + ((1 - lambda) * freq / docLen) / + (lambda * ((LMStats)stats).getCollectionProbability())); + } + + @Override + protected void explain(Explanation expl, BasicStats stats, int doc, + float freq, float docLen) { + if (stats.getTotalBoost() != 1.0f) { + expl.addDetail(new Explanation(stats.getTotalBoost(), "boost")); + } + expl.addDetail(new Explanation(lambda, "lambda")); + super.explain(expl, stats, doc, freq, docLen); + } + + /** Returns the λ parameter. */ + public float getLambda() { + return lambda; + } + + @Override + public String getName() { + return String.format("Jelinek-Mercer(%f)", getLambda()); + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java new file mode 100644 index 00000000000..bf1df961169 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -0,0 +1,155 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.TermContext; + +/** + * Abstract superclass for language modeling Similarities. The following inner + * types are introduced: + *Used in {@link #toString()}
. + */ + public abstract String getName(); + + /** + * Returns the name of the LM method. If a custom collection model strategy is + * used, its name is included as well. + * @see #getName() + * @see CollectionModel#getName() + * @see DefaultCollectionModel + */ + @Override + public String toString() { + String coll = collectionModel.getName(); + if (coll != null) { + return String.format("LM %s - %s", getName(), coll); + } else { + return String.format("LM %s", getName()); + } + } + + /** Stores the collection distribution of the current term. */ + public static class LMStats extends BasicStats { + /** The probability that the current term is generated by the collection. */ + private float collectionProbability; + + public LMStats(float queryBoost) { + super(queryBoost); + } + + /** + * Returns the probability that the current term is generated by the + * collection. + */ + public final float getCollectionProbability() { + return collectionProbability; + } + + /** + * Sets the probability that the current term is generated by the + * collection. + */ + public final void setCollectionProbability(float collectionProbability) { + this.collectionProbability = collectionProbability; + } + } + + /** A strategy for computing the collection language model. */ + public static interface CollectionModel { + /** + * Computes the probability {@code p(w|C)} according to the language model + * strategy for the current term. + */ + public float computeProbability(BasicStats stats); + + /** The name of the collection model strategy. */ + public String getName(); + } + + /** + * Models {@code p(w|C)} as the number of occurrences of the term in the + * collection, divided by the total number of tokens {@code + 1}. 
+ */ + public static class DefaultCollectionModel implements CollectionModel { + @Override + public float computeProbability(BasicStats stats) { + return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1); + } + + @Override + public String getName() { + return null; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java b/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java new file mode 100644 index 00000000000..64b8c34fd85 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/Lambda.java @@ -0,0 +1,42 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * The lambda (λw) parameter in information-based + * models. + * @see IBSimilarity + * @lucene.experimental + */ +public abstract class Lambda { + /** Computes the lambda parameter. */ + public abstract float lambda(BasicStats stats); + /** Explains the lambda parameter. */ + public abstract Explanation explain(BasicStats stats); + + /** + * Subclasses must override this method to return the code of the lambda + * formula. 
Since the original paper is not very clear on this matter, and + * also uses the DFR naming scheme incorrectly, the codes here were chosen + * arbitrarily. + */ + @Override + public abstract String toString(); +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java new file mode 100644 index 00000000000..7e4a8240b87 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java @@ -0,0 +1,48 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Computes lambda as {@code docFreq / numberOfDocuments}.
+ * @lucene.experimental + */ +public class LambdaDF extends Lambda { + @Override + public final float lambda(BasicStats stats) { + return (float)stats.getDocFreq() / stats.getNumberOfDocuments(); + } + + @Override + public final Explanation explain(BasicStats stats) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(lambda(stats)); + result.addDetail( + new Explanation(stats.getDocFreq(), "docFreq")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + return result; + } + + @Override + public String toString() { + return "D"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java new file mode 100644 index 00000000000..25c55bd72ce --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java @@ -0,0 +1,48 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
+ * @lucene.experimental + */ +public class LambdaTTF extends Lambda { + @Override + public final float lambda(BasicStats stats) { + return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); + } + + @Override + public final Explanation explain(BasicStats stats) { + Explanation result = new Explanation(); + result.setDescription(getClass().getSimpleName() + ", computed from: "); + result.setValue(lambda(stats)); + result.addDetail( + new Explanation(stats.getTotalTermFreq(), "totalTermFreq")); + result.addDetail( + new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments")); + return result; + } + + @Override + public String toString() { + return "L"; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java new file mode 100644 index 00000000000..46546ada55e --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java @@ -0,0 +1,159 @@ +package org.apache.lucene.search.similarities; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.TermContext;
+
+/**
+ * Implements the CombSUM method for combining evidence from multiple
+ * similarity values described in: Joseph A. Shaw, Edward A. Fox.
+ * In Text REtrieval Conference (1993), pp. 243-252
+ * <p>
+ * Each wrapped similarity computes its own statistics and scorers, and the
+ * score of a document is the sum of the sub-scores. Norms, the slop factor
+ * and the payload factor are delegated to the first wrapped similarity.
+ * @lucene.experimental
+ */
+public class MultiSimilarity extends Similarity {
+  /** The wrapped similarities, stored as passed in (not copied); never empty. */
+  protected final Similarity sims[];
+
+  /**
+   * Creates a similarity that sums the scores of {@code sims}.
+   * @param sims the similarities to combine.
+   * @throws IllegalArgumentException if {@code sims} is null or empty.
+   */
+  public MultiSimilarity(Similarity sims[]) {
+    // Fail fast: computeNorm() unconditionally dereferences sims[0].
+    if (sims == null || sims.length == 0) {
+      throw new IllegalArgumentException(
+          "MultiSimilarity requires at least one sub-similarity");
+    }
+    this.sims = sims;
+  }
+
+  /** Returns the norm computed by the first wrapped similarity. */
+  @Override
+  public byte computeNorm(FieldInvertState state) {
+    return sims[0].computeNorm(state);
+  }
+
+  /**
+   * Asks every wrapped similarity for its stats, passing the arguments through
+   * unchanged.
+   * @return a {@link MultiStats} holding one entry per wrapped similarity, in
+   *         the order of {@link #sims}.
+   */
+  @Override
+  public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
+    Stats subStats[] = new Stats[sims.length];
+    for (int i = 0; i < subStats.length; i++) {
+      subStats[i] = sims[i].computeStats(searcher, fieldName, queryBoost, termContexts);
+    }
+    return new MultiStats(subStats);
+  }
+
+  /**
+   * Builds one exact scorer per wrapped similarity and sums them.
+   * @param stats must be the {@link MultiStats} returned by
+   *        {@link #computeStats}; it is cast without a check.
+   */
+  @Override
+  public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+    ExactDocScorer subScorers[] = new ExactDocScorer[sims.length];
+    for (int i = 0; i < subScorers.length; i++) {
+      subScorers[i] = sims[i].exactDocScorer(((MultiStats)stats).subStats[i], fieldName, context);
+    }
+    return new MultiExactDocScorer(subScorers);
+  }
+
+  /**
+   * Builds one sloppy scorer per wrapped similarity and sums them.
+   * @param stats must be the {@link MultiStats} returned by
+   *        {@link #computeStats}; it is cast without a check.
+   */
+  @Override
+  public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+    SloppyDocScorer subScorers[] = new SloppyDocScorer[sims.length];
+    for (int i = 0; i < subScorers.length; i++) {
+      subScorers[i] = sims[i].sloppyDocScorer(((MultiStats)stats).subStats[i], fieldName, context);
+    }
+    return new MultiSloppyDocScorer(subScorers);
+  }
+
+  /** Exact scorer whose score is the sum of the scores of its sub-scorers. */
+  public static class MultiExactDocScorer extends ExactDocScorer {
+    private final ExactDocScorer subScorers[];
+
+    MultiExactDocScorer(ExactDocScorer subScorers[]) {
+      this.subScorers = subScorers;
+    }
+
+    /** Returns the sum of the sub-scores for {@code doc} and {@code freq}. */
+    @Override
+    public float score(int doc, int freq) {
+      float sum = 0.0f;
+      for (ExactDocScorer subScorer : subScorers) {
+        sum += subScorer.score(doc, freq);
+      }
+      return sum;
+    }
+
+    /** Returns a "sum of:" explanation with one detail per sub-scorer. */
+    @Override
+    public Explanation explain(int doc, Explanation freq) {
+      // The exact scorer works on integer frequencies, hence the narrowing cast.
+      Explanation expl = new Explanation(score(doc, (int)freq.getValue()), "sum of:");
+      for (ExactDocScorer subScorer : subScorers) {
+        expl.addDetail(subScorer.explain(doc, freq));
+      }
+      return expl;
+    }
+  }
+
+  /** Sloppy scorer whose score is the sum of the scores of its sub-scorers. */
+  public static class MultiSloppyDocScorer extends SloppyDocScorer {
+    private final SloppyDocScorer subScorers[];
+
+    MultiSloppyDocScorer(SloppyDocScorer subScorers[]) {
+      this.subScorers = subScorers;
+    }
+
+    /** Returns the sum of the sub-scores for {@code doc} and {@code freq}. */
+    @Override
+    public float score(int doc, float freq) {
+      float sum = 0.0f;
+      for (SloppyDocScorer subScorer : subScorers) {
+        sum += subScorer.score(doc, freq);
+      }
+      return sum;
+    }
+
+    /** Returns a "sum of:" explanation with one detail per sub-scorer. */
+    @Override
+    public Explanation explain(int doc, Explanation freq) {
+      Explanation expl = new Explanation(score(doc, freq.getValue()), "sum of:");
+      for (SloppyDocScorer subScorer : subScorers) {
+        expl.addDetail(subScorer.explain(doc, freq));
+      }
+      return expl;
+    }
+
+    /** Delegates to the first sub-scorer only. */
+    @Override
+    public float computeSlopFactor(int distance) {
+      return subScorers[0].computeSlopFactor(distance);
+    }
+
+    /** Delegates to the first sub-scorer only. */
+    @Override
+    public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+      return subScorers[0].computePayloadFactor(doc, start, end, payload);
+    }
+  }
+
+  /** Bundles several {@link Stats} objects so they can be handled as one. */
+  public static class MultiStats extends Stats {
+    final Stats subStats[];
+
+    MultiStats(Stats subStats[]) {
+      this.subStats = subStats;
+    }
+
+    /** Returns the arithmetic mean of the sub-stats' normalization values. */
+    @Override
+    public float getValueForNormalization() {
+      float sum = 0.0f;
+      for (Stats stat : subStats) {
+        sum += stat.getValueForNormalization();
+      }
+      return sum / subStats.length;
+    }
+
+    /** Forwards the same {@code queryNorm} and {@code topLevelBoost} to every sub-stats. */
+    @Override
+    public void normalize(float queryNorm, float topLevelBoost) {
+      for (Stats stat : subStats) {
+        stat.normalize(queryNorm, topLevelBoost);
+      }
+    }
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java b/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java
new file mode 100644
index 00000000000..f635baa1c48
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/Normalization.java
@@ -0,0 +1,85 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Explanation;
+
+/**
+ * This class acts as the base class for the implementations of the term
+ * frequency normalization methods in the DFR framework.
+ *
+ * @see DFRSimilarity
+ * @lucene.experimental
+ */
+public abstract class Normalization {
+  /** Returns the normalized term frequency.
+   * @param stats the corpus level statistics.
+   * @param tf the raw term frequency.
+   * @param len the field length. */
+  public abstract float tfn(BasicStats stats, float tf, float len);
+
+  /** Returns an explanation for the normalized term frequency.
+   * <p>The default normalization methods use the field length of the document
+   * and the average field length to compute the normalized term frequency.
+   * This method provides a generic explanation for such methods.
+   * Subclasses that use other statistics must override this method.</p>
+   * @param stats the corpus level statistics.
+   * @param tf the raw term frequency.
+   * @param len the field length.
+   * @return an explanation whose value is {@code tfn(stats, tf, len)}.
+   */
+  public Explanation explain(BasicStats stats, float tf, float len) {
+    Explanation result = new Explanation();
+    result.setDescription(getClass().getSimpleName() + ", computed from: ");
+    result.setValue(tfn(stats, tf, len));
+    result.addDetail(new Explanation(tf, "tf"));
+    result.addDetail(
+        new Explanation(stats.getAvgFieldLength(), "avgFieldLength"));
+    result.addDetail(new Explanation(len, "len"));
+    return result;
+  }
+
+  /** Implementation used when there is no normalization. */
+  public static final class NoNormalization extends Normalization {
+    /** Returns {@code tf} unchanged. */
+    @Override
+    public final float tfn(BasicStats stats, float tf, float len) {
+      return tf;
+    }
+
+    /** Explains the unnormalized frequency; the value equals {@link #tfn}. */
+    @Override
+    public final Explanation explain(BasicStats stats, float tf, float len) {
+      // Report tf, not a constant: the explanation value must agree with tfn().
+      return new Explanation(tf, "no normalization");
+    }
+
+    /** Returns the empty string. */
+    @Override
+    public String toString() {
+      return "";
+    }
+  }
+
+  /**
+   * Subclasses must override this method to return the code of the
+   * normalization formula. Refer to the original paper for the list.
+   */
+  @Override
+  public abstract String toString();
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java
new file mode 100644
index 00000000000..77b18055903
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java
@@ -0,0 +1,34 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalization model that assumes a uniform distribution of the term frequency.
+ * <p>
+ * The term frequency is rescaled by the ratio of the average field length to
+ * the length of the scored field: {@code tfn = tf * avgFieldLength / len}.
+ * @lucene.experimental
+ */
+public class NormalizationH1 extends Normalization {
+  /**
+   * Returns {@code tf * avgFieldLength / len}.
+   * @param stats supplies the average field length.
+   * @param tf the raw term frequency.
+   * @param len the field length.
+   */
+  @Override
+  public final float tfn(BasicStats stats, float tf, float len) {
+    final float averageLength = stats.getAvgFieldLength();
+    return tf * averageLength / len;
+  }
+
+  /** Returns the code of this normalization, {@code "1"}. */
+  @Override
+  public String toString() {
+    return "1";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java
new file mode 100644
index 00000000000..9055e6f7f73
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java
@@ -0,0 +1,37 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.search.similarities.SimilarityBase.log2;
+
+/**
+ * Normalization model in which the term frequency is inversely related to the
+ * length.
+ * <p>
+ * Computes {@code tfn = tf * log2(1 + avgFieldLength / len)}.
+ * @lucene.experimental
+ */
+public class NormalizationH2 extends Normalization {
+  /**
+   * Returns {@code tf * log2(1 + avgFieldLength / len)}, narrowed to a float.
+   * @param stats supplies the average field length.
+   * @param tf the raw term frequency.
+   * @param len the field length.
+   */
+  @Override
+  public final float tfn(BasicStats stats, float tf, float len) {
+    // The ratio is computed in float precision; only log2() runs in double.
+    final float lengthRatio = stats.getAvgFieldLength() / len;
+    final double scaled = tf * log2(1 + lengthRatio);
+    return (float) scaled;
+  }
+
+  /** Returns the code of this normalization, {@code "2"}. */
+  @Override
+  public String toString() {
+    return "2";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java
new file mode 100644
index 00000000000..97bf86a1135
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java
@@ -0,0 +1,44 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Dirichlet Priors normalization
+ * <p>
+ * Computes
+ * {@code tfn = (tf + mu * totalTermFreq / numberOfFieldTokens) / (len + mu) * mu},
+ * where {@code mu} is the smoothing parameter given at construction time.
+ * @lucene.experimental
+ */
+public class NormalizationH3 extends Normalization {
+  /** The smoothing parameter of the formula. */
+  private final float mu;
+
+  /** Creates the normalization with {@code mu = 800}. */
+  public NormalizationH3() {
+    this(800F);
+  }
+
+  /**
+   * Creates the normalization with the given smoothing parameter.
+   * @param mu the smoothing parameter; stored as is, without validation.
+   */
+  public NormalizationH3(float mu) {
+    this.mu = mu;
+  }
+
+  /**
+   * Returns the smoothed term frequency described in the class comment.
+   * @param stats supplies the total term frequency and the number of field tokens.
+   * @param tf the raw term frequency.
+   * @param len the field length.
+   */
+  @Override
+  public float tfn(BasicStats stats, float tf, float len) {
+    // Share of the field's tokens that are occurrences of the term.
+    final float termShare =
+        stats.getTotalTermFreq() / (float) stats.getNumberOfFieldTokens();
+    final float smoothed = tf + mu * termShare;
+    return smoothed / (len + mu) * mu;
+  }
+
+  /** Returns the code of this normalization followed by the value of {@code mu}. */
+  @Override
+  public String toString() {
+    return "3(" + mu + ")";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java
new file mode 100644
index 00000000000..9f15288f222
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationZ.java
@@ -0,0 +1,44 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Pareto-Zipf Normalization
+ * <p>
+ * Computes {@code tfn = tf * (avgFieldLength / len)^z}, where {@code z} is the
+ * exponent given at construction time.
+ * @lucene.experimental
+ */
+public class NormalizationZ extends Normalization {
+  /** The exponent applied to the length ratio. */
+  final float z;
+
+  /** Creates the normalization with {@code z = 0.30}. */
+  public NormalizationZ() {
+    this(0.30F);
+  }
+
+  /**
+   * Creates the normalization with the given exponent.
+   * @param z the exponent; stored as is, without validation.
+   */
+  public NormalizationZ(float z) {
+    this.z = z;
+  }
+
+  /**
+   * Returns {@code tf * (avgFieldLength / len)^z}, narrowed to a float.
+   * @param stats supplies the average field length.
+   * @param tf the raw term frequency.
+   * @param len the field length.
+   */
+  @Override
+  public float tfn(BasicStats stats, float tf, float len) {
+    // Read the average length through the accessor instead of the raw field.
+    return (float)(tf * Math.pow(stats.getAvgFieldLength() / len, z));
+  }
+
+  /** Returns the code of this normalization followed by the value of {@code z}. */
+  @Override
+  public String toString() {
+    return "Z(" + z + ")";
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java
similarity index 96%
rename from lucene/src/java/org/apache/lucene/search/Similarity.java
rename to lucene/src/java/org/apache/lucene/search/similarities/Similarity.java
index fe184be66c8..c0262eb21ca 100644
--- a/lucene/src/java/org/apache/lucene/search/Similarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/Similarity.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search;
+package org.apache.lucene.search.similarities;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -25,6 +25,12 @@ import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader; // javadoc
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.index.Terms; // javadoc
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.spans.SpanQuery; // javadoc
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat; // javadoc
@@ -140,7 +146,7 @@ public abstract class Similarity {
    *
    * Term frequencies are integers (the term or phrase's tf)
    */
-  public abstract class ExactDocScorer {
+  public static abstract class ExactDocScorer {
     /**
      * 
Score a single document * @param doc document id @@ -169,7 +175,7 @@ public abstract class Similarity { *
* Term frequencies are floating point values.
    */
-  public abstract class SloppyDocScorer {
+  public static abstract class SloppyDocScorer {
     /**
      * Score a single document
      * @param doc document id
diff --git a/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
new file mode 100644
index 00000000000..4eb2c8c1315
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@@ -0,0 +1,372 @@
+package org.apache.lucene.search.similarities;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
+import org.apache.lucene.util.TermContext;
+
+/**
+ * A subclass of {@code Similarity} that provides a simplified API for its
+ * descendants. Subclasses are only required to implement the {@link #score}
+ * and {@link #toString()} methods. Implementing
+ * {@link #explain(Explanation, BasicStats, int, float, float)} is optional,
+ * inasmuch as SimilarityBase already provides a basic explanation of the score
+ * and the term frequency. However, implementers of a subclass are encouraged to
+ * include as much detail about the scoring method as possible.
+ * <p>
+ * Note: multi-word queries such as phrase queries are scored in a different way
+ * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
+ * the phrase as a whole (since it does not know it), this class instead scores
+ * phrases as a summation of the individual term scores.
+ * @lucene.experimental
+ */
+public abstract class SimilarityBase extends Similarity {
+  /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
+  private static final double LOG_2 = Math.log(2);
+
+  /** @see #setDiscountOverlaps */
+  protected boolean discountOverlaps = true;
+
+  /** Determines whether overlap tokens (Tokens with
+   * 0 position increment) are ignored when computing
+   * norm. By default this is true, meaning overlap
+   * tokens do not count when computing norms.
+   *
+   * @lucene.experimental
+   *
+   * @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /** @see #setDiscountOverlaps */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
+  /**
+   * Collects the statistics for every term of the query. Each term gets its
+   * own {@link BasicStats}, created by {@link #newStats} and populated by
+   * {@link #fillBasicStats}. A single term returns that object as is; several
+   * terms are wrapped in a {@code MultiSimilarity.MultiStats}.
+   */
+  @Override
+  public final Stats computeStats(IndexSearcher searcher, String fieldName,
+      float queryBoost, TermContext... termContexts) throws IOException {
+    BasicStats stats[] = new BasicStats[termContexts.length];
+    for (int i = 0; i < termContexts.length; i++) {
+      stats[i] = newStats(queryBoost);
+      fillBasicStats(stats[i], searcher, fieldName, termContexts[i]);
+    }
+    return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
+  }
+
+  /** Factory method to return a custom stats object */
+  protected BasicStats newStats(float queryBoost) {
+    return new BasicStats(queryBoost);
+  }
+
+  /** Fills all member fields defined in {@code BasicStats} in {@code stats}.
+   * Subclasses can override this method to fill additional stats. */
+  protected void fillBasicStats(BasicStats stats, IndexSearcher searcher,
+      String fieldName, TermContext termContext) throws IOException {
+    IndexReader reader = searcher.getIndexReader();
+    int numberOfDocuments = reader.maxDoc();
+
+    int docFreq = termContext.docFreq();
+    long totalTermFreq = termContext.totalTermFreq();
+
+    // codec does not supply totalTermFreq: substitute docFreq
+    if (totalTermFreq == -1) {
+      totalTermFreq = docFreq;
+    }
+
+    final long numberOfFieldTokens;
+    final float avgFieldLength;
+
+    Terms terms = MultiFields.getTerms(reader, fieldName);
+    if (terms == null) {
+      // field does not exist;
+      numberOfFieldTokens = 0;
+      avgFieldLength = 1;
+    } else {
+      long sumTotalTermFreq = terms.getSumTotalTermFreq();
+
+      // We have to provide something if codec doesn't supply these measures,
+      // or if someone omitted frequencies for the field... negative values cause
+      // NaN/Inf for some scorers.
+      if (sumTotalTermFreq == -1) {
+        numberOfFieldTokens = docFreq;
+        avgFieldLength = 1;
+      } else {
+        numberOfFieldTokens = sumTotalTermFreq;
+        avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
+      }
+    }
+
+    // TODO: add sumDocFreq for field (numberOfFieldPostings)
+    stats.setNumberOfDocuments(numberOfDocuments);
+    stats.setNumberOfFieldTokens(numberOfFieldTokens);
+    stats.setAvgFieldLength(avgFieldLength);
+    stats.setDocFreq(docFreq);
+    stats.setTotalTermFreq(totalTermFreq);
+  }
+
+  /**
+   * Scores a document from its term frequency and its length.
+   * <p>Subclasses must apply their scoring formula in this method.</p>
+   * @param stats the corpus level statistics.
+   * @param freq the term frequency.
+   * @param docLen the document length.
+   * @return the score.
+   */
+  protected abstract float score(BasicStats stats, float freq, float docLen);
+
+  /**
+   * Subclasses should implement this method to explain the score. {@code expl}
+   * already contains the score, the name of the class and the doc id, as well
+   * as the term frequency and its explanation; subclasses can add additional
+   * clauses to explain details of their scoring formulae.
+   * <p>The default implementation does nothing.</p>
+   *
+   * @param expl the explanation to extend with details.
+   * @param stats the corpus level statistics.
+   * @param doc the document id.
+   * @param freq the term frequency.
+   * @param docLen the document length.
+   */
+  protected void explain(
+      Explanation expl, BasicStats stats, int doc, float freq, float docLen) {}
+
+  /**
+   * Explains the score. The implementation here provides a basic explanation
+   * in the format <em>score(name-of-similarity, doc=doc-id,
+   * freq=term-frequency), computed from:</em>, and
+   * attaches the score (computed via the {@link #score(BasicStats, float, float)}
+   * method) and the explanation for the term frequency. Subclasses content with
+   * this format may add additional details in
+   * {@link #explain(Explanation, BasicStats, int, float, float)}.
+   *
+   * @param stats the corpus level statistics.
+   * @param doc the document id.
+   * @param freq the term frequency and its explanation.
+   * @param docLen the document length.
+   * @return the explanation.
+   */
+  protected Explanation explain(
+      BasicStats stats, int doc, Explanation freq, float docLen) {
+    Explanation result = new Explanation();
+    result.setValue(score(stats, freq.getValue(), docLen));
+    result.setDescription("score(" + getClass().getSimpleName() +
+        ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:");
+    result.addDetail(freq);
+
+    explain(result, stats, doc, freq.getValue(), docLen);
+
+    return result;
+  }
+
+  /**
+   * Creates the scorer for exact (integer frequency) matches. The norms of
+   * {@code fieldName} are read from {@code context}; stats of a multi-term
+   * query produce one sub-scorer per term whose scores are summed.
+   */
+  @Override
+  public ExactDocScorer exactDocScorer(Stats stats, String fieldName,
+      AtomicReaderContext context) throws IOException {
+    byte norms[] = context.reader.norms(fieldName);
+
+    if (stats instanceof MultiSimilarity.MultiStats) {
+      // a multi term query (e.g. phrase). return the summation,
+      // scoring almost as if it were boolean query
+      Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
+      ExactDocScorer subScorers[] = new ExactDocScorer[subStats.length];
+      for (int i = 0; i < subScorers.length; i++) {
+        subScorers[i] = new BasicExactDocScorer((BasicStats)subStats[i], norms);
+      }
+      return new MultiSimilarity.MultiExactDocScorer(subScorers);
+    } else {
+      return new BasicExactDocScorer((BasicStats) stats, norms);
+    }
+  }
+
+  /**
+   * Creates the scorer for sloppy (float frequency) matches. The norms of
+   * {@code fieldName} are read from {@code context}; stats of a multi-term
+   * query produce one sub-scorer per term whose scores are summed.
+   */
+  @Override
+  public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName,
+      AtomicReaderContext context) throws IOException {
+    byte norms[] = context.reader.norms(fieldName);
+
+    if (stats instanceof MultiSimilarity.MultiStats) {
+      // a multi term query (e.g. phrase). return the summation,
+      // scoring almost as if it were boolean query
+      Stats subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
+      SloppyDocScorer subScorers[] = new SloppyDocScorer[subStats.length];
+      for (int i = 0; i < subScorers.length; i++) {
+        subScorers[i] = new BasicSloppyDocScorer((BasicStats)subStats[i], norms);
+      }
+      return new MultiSimilarity.MultiSloppyDocScorer(subScorers);
+    } else {
+      return new BasicSloppyDocScorer((BasicStats) stats, norms);
+    }
+  }
+
+  /**
+   * Subclasses must override this method to return the name of the Similarity
+   * and preferably the values of parameters (if any) as well.
+   */
+  @Override
+  public abstract String toString();
+
+  // ------------------------------ Norm handling ------------------------------
+
+  /** Norm -> document length map. */
+  private static final float[] NORM_TABLE = new float[256];
+
+  static {
+    for (int i = 0; i < 256; i++) {
+      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
+      NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
+    }
+  }
+
+  /**
+   * Encodes the document length in the same way as {@link TFIDFSimilarity}.
+   * Overlap tokens are subtracted from the length when
+   * {@link #discountOverlaps} is set; the boost is folded in by
+   * {@link #encodeNormValue(float, float)}.
+   */
+  @Override
+  public byte computeNorm(FieldInvertState state) {
+    final float numTerms;
+    if (discountOverlaps) {
+      numTerms = state.getLength() - state.getNumOverlap();
+    } else {
+      // The boost is applied exactly once, by encodeNormValue(); the raw
+      // length must not be divided by it here.
+      numTerms = state.getLength();
+    }
+    return encodeNormValue(state.getBoost(), numTerms);
+  }
+
+  /** Decodes a normalization factor (document length) stored in an index.
+   * @see #encodeNormValue(float,float)
+   */
+  protected float decodeNormValue(byte norm) {
+    return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+  }
+
+  /** Encodes the length to a byte via SmallFloat. */
+  protected byte encodeNormValue(float boost, float length) {
+    return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length)));
+  }
+
+  // ----------------------------- Static methods ------------------------------
+
+  /** Returns the base two logarithm of {@code x}. */
+  public static double log2(double x) {
+    // Put this to a 'util' class if we need more of these.
+    return Math.log(x) / LOG_2;
+  }
+
+  // --------------------------------- Classes ---------------------------------
+
+  /** Delegates the {@link #score(int, int)} and
+   * {@link #explain(int, Explanation)} methods to
+   * {@link SimilarityBase#score(BasicStats, float, float)} and
+   * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
+   * respectively.
+   */
+  private class BasicExactDocScorer extends ExactDocScorer {
+    private final BasicStats stats;
+    private final byte[] norms;
+
+    BasicExactDocScorer(BasicStats stats, byte norms[]) {
+      this.stats = stats;
+      this.norms = norms;
+    }
+
+    @Override
+    public float score(int doc, int freq) {
+      // We have to supply something in case norms are omitted
+      return SimilarityBase.this.score(stats, freq,
+          norms == null ? 1F : decodeNormValue(norms[doc]));
+    }
+
+    @Override
+    public Explanation explain(int doc, Explanation freq) {
+      return SimilarityBase.this.explain(stats, doc, freq,
+          norms == null ? 1F : decodeNormValue(norms[doc]));
+    }
+  }
+
+  /** Delegates the {@link #score(int, float)} and
+   * {@link #explain(int, Explanation)} methods to
+   * {@link SimilarityBase#score(BasicStats, float, float)} and
+   * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
+   * respectively.
+   */
+  private class BasicSloppyDocScorer extends SloppyDocScorer {
+    private final BasicStats stats;
+    private final byte[] norms;
+
+    BasicSloppyDocScorer(BasicStats stats, byte norms[]) {
+      this.stats = stats;
+      this.norms = norms;
+    }
+
+    @Override
+    public float score(int doc, float freq) {
+      // We have to supply something in case norms are omitted
+      return SimilarityBase.this.score(stats, freq,
+          norms == null ? 1F : decodeNormValue(norms[doc]));
+    }
+
+    @Override
+    public Explanation explain(int doc, Explanation freq) {
+      return SimilarityBase.this.explain(stats, doc, freq,
+          norms == null ? 1F : decodeNormValue(norms[doc]));
+    }
+
+    /** Returns {@code 1 / (distance + 1)}. */
+    @Override
+    public float computeSlopFactor(int distance) {
+      return 1.0f / (distance + 1);
+    }
+
+    /** Ignores the payload: always returns 1. */
+    @Override
+    public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+      return 1f;
+    }
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/search/SimilarityProvider.java b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java
similarity index 96%
rename from lucene/src/java/org/apache/lucene/search/SimilarityProvider.java
rename to lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java
index ef9a034e7eb..e3f6e86dca0 100644
--- a/lucene/src/java/org/apache/lucene/search/SimilarityProvider.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityProvider.java
@@ -1,4 +1,6 @@
-package org.apache.lucene.search;
+package org.apache.lucene.search.similarities;
+
+import org.apache.lucene.search.BooleanQuery;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
diff --git a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java 
b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java similarity index 97% rename from lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java rename to lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index 4209dd490cf..163fcb2c966 100644 --- a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.search.similarities; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,6 +21,10 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TermContext; import org.apache.lucene.util.SmallFloat; @@ -303,13 +307,13 @@ import org.apache.lucene.util.SmallFloat; * two term-queries with that same term and hence the computation would still be correct (although * not very efficient). * The default computation for tf(t in d) in - * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) DefaultSimilarity} is: * *- * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} = + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) tf(t in d)} = * | *
* frequency½
@@ -328,13 +332,13 @@ import org.apache.lucene.util.SmallFloat;
* idf(t) appears for t in both the query and the document,
* hence it is squared in the equation.
* The default computation for idf(t) in
- * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
+ * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* *
|