The boost is multiplied by {@link Document#getBoost()} of the document * containing this field. If a document has multiple fields with the same * name, all such values are multiplied together. This product is then - * multipled by the value {@link Similarity#normalizeLength(int)}, and + * multipled by the value {@link Similarity#lengthNorm(String,int)}, and * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * * @see Document#setBoost(float) - * @see Similarity#normalizeLength(int) + * @see Similarity#lengthNorm(String, int) * @see Similarity#encodeNorm(float) */ public void setBoost(float boost) { diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java index 79167e90c9d..d0b695dfe35 100644 --- a/src/java/org/apache/lucene/index/DocumentWriter.java +++ b/src/java/org/apache/lucene/index/DocumentWriter.java @@ -73,13 +73,16 @@ import org.apache.lucene.search.Similarity; final class DocumentWriter { private Analyzer analyzer; private Directory directory; + private Similarity similarity; private FieldInfos fieldInfos; private int maxFieldLength; - - DocumentWriter(Directory d, Analyzer a, int mfl) { - directory = d; - analyzer = a; - maxFieldLength = mfl; + + DocumentWriter(Directory directory, Analyzer analyzer, + Similarity similarity, int maxFieldLength) { + this.directory = directory; + this.analyzer = analyzer; + this.similarity = similarity; + this.maxFieldLength = maxFieldLength; } final void addDocument(String segment, Document doc) @@ -320,10 +323,10 @@ final class DocumentWriter { if (field.isIndexed()) { int n = fieldInfos.fieldNumber(field.name()); float norm = - fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]); + fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]); OutputStream norms = directory.createFile(segment + ".f" + n); try { - 
norms.writeByte(Similarity.encodeNorm(norm)); + norms.writeByte(similarity.encodeNorm(norm)); } finally { norms.close(); } diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java index 0238a42211c..846776db5b3 100644 --- a/src/java/org/apache/lucene/index/IndexWriter.java +++ b/src/java/org/apache/lucene/index/IndexWriter.java @@ -68,6 +68,8 @@ import org.apache.lucene.store.OutputStream; import org.apache.lucene.search.Similarity; import org.apache.lucene.document.Document; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Similarity; + /** An IndexWriter creates and maintains an index. @@ -89,12 +91,28 @@ public class IndexWriter { private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text + private Similarity similarity = Similarity.getDefault(); // how to normalize + private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private final Directory ramDirectory = new RAMDirectory(); // for temp segs private Lock writeLock; - private Similarity similarity; + /** Expert: Set the Similarity implementation used by this IndexWriter. + * + * @see Similarity#setDefault(Similarity) + */ + public void setSimilarity(Similarity similarity) { + this.similarity = similarity; + } + + /** Expert: Return the Similarity implementation used by this IndexWriter. + * + *
This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ public Similarity getSimilarity() {
+ return this.similarity;
+ }
/** Constructs an IndexWriter for the index in Public only so that the indexing code can compute and store the
- * normalization byte for each document. */
+/** Expert: Scoring API.
+ * Subclasses implement search scoring.
+ *
+ * The score of query This is initially an instance of {@link DefaultSimilarity}.
+ *
+ * @see Searcher#setSimilarity(Similarity)
+ * @see IndexWriter#setSimilarity(Similarity)
+ */
+ public static Similarity getDefault() {
+ return Similarity.defaultImpl;
+ }
+
+ /** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
@@ -70,31 +129,6 @@ public abstract class Similarity {
NORM_TABLE[i] = byteToFloat((byte)i);
}
- private static Similarity similarity;
-
- private Similarity() {} // no public constructor
-
- /**
- * Sets the Matches in longer fields are less precise, so implemenations of this
+ * method usually return smaller values when That these values are computed under {@link
+ * IndexWriter#addDocument(Document)} and stored then using
+ * {@link #encodeNorm(float)}. Thus they have limited precision, and documents
+ * must be re-indexed if this method is altered.
+ *
+ * @param fieldName the name of the field
+ * @param numTokens the total number of tokens contained in fields named
+ * fieldName of doc.
+ * @return a normalization factor for hits on this field of this document
+ *
+ * @see Field#setBoost(float)
+ */
+ public abstract float lengthNorm(String fieldName, int numTokens);
+
+ /** Computes the normalization value for a query given the sum of the squared
+ * weights of each of the query terms. This value is then multiplied into the
+ * weight of each query term.
+ *
+ * This does not affect ranking, but rather just attempts to make scores
+ * from different queries comparable.
+ *
+ * @param sumOfSquaredWeights the sum of the squares of query term weights
+ * @return a normalization factor for query weights
+ */
+ public abstract float queryNorm(float sumOfSquaredWeights);
+
/** Encodes a normalization factor for storage in an index.
*
* The encoding uses a five-bit exponent and three-bit mantissa, thus
@@ -151,25 +220,118 @@ public abstract class Similarity {
return (byte)((exponent << 3) | mantissa); // pack into a byte
}
- static final float tf(int freq) {
- return (float)Math.sqrt(freq);
+
+ /** Computes a score factor based on a term or phrase's frequency in a
+ * document. This value is multiplied by the {@link #idf(Term, Searcher)}
+ * factor for each term in the query and these products are then summed to
+ * form the initial score for a document.
+ *
+ * Terms and phrases repeated in a document indicate the topic of the
+ * document, so implementations of this method usually return larger values
+ * when The default implementation calls {@link #tf(float)}.
+ *
+ * @param freq the frequency of a term within a document
+ * @return a score factor based on a term's within-document frequency
+ */
+ public float tf(int freq) {
+ return tf((float)freq);
}
- static final float tf(float freq) {
- return (float)Math.sqrt(freq);
- }
+ /** Computes the amount of a sloppy phrase match, based on an edit distance.
+ * This value is summed for each sloppy phrase match in a document to form
+ * the frequency that is passed to {@link #tf(float)}.
+ *
+ * A phrase match with a small edit distance to a document passage more
+ * closely matches the document, so implementations of this method usually
+ * return larger values when the edit distance is small and smaller values
+ * when it is large.
+ *
+ * @see PhraseQuery#setSlop(int)
+ * @param distance the edit distance of this sloppy phrase match
+ * @return the frequency increment for this match
+ */
+ public abstract float sloppyFreq(int distance);
+
+ /** Computes a score factor based on a term or phrase's frequency in a
+ * document. This value is multiplied by the {@link #idf(Term, Searcher)}
+ * factor for each term in the query and these products are then summed to
+ * form the initial score for a document.
+ *
+ * Terms and phrases repeated in a document indicate the topic of the
+ * document, so implementations of this method usually return larger values
+ * when The default implementation is: The default implementation sums the {@link #idf(Term,Searcher)} factor
+ * for each term in the phrase.
+ *
+ * @param terms the vector of terms in the phrase
+ * @param searcher the document collection being searched
+ * @return a score factor for the phrase
+ */
+ public float idf(Vector terms, Searcher searcher) throws IOException {
+ float idf = 0.0f;
+ for (int i = 0; i < terms.size(); i++) {
+ idf += idf((Term)terms.elementAt(i), searcher);
+ }
+ return idf;
}
+
+ /** Computes a score factor based on a term's document frequency (the number
+ * of documents which contain the term). This value is multiplied by the
+ * {@link #tf(int)} factor for each term in the query and these products are
+ * then summed to form the initial score for a document.
+ *
+ * Terms that occur in fewer documents are better indicators of topic, so
+ * implementations of this method usually return larger values for rare terms,
+ * and smaller values for common terms.
+ *
+ * @param docFreq the number of documents which contain the term
+ * @param numDocs the total number of documents in the collection
+ * @return a score factor based on the term's document frequency
+ */
+ protected abstract float idf(int docFreq, int numDocs);
- static final float coord(int overlap, int maxOverlap) {
- return overlap / (float)maxOverlap;
- }
+ /** Computes a score factor based on the fraction of all query terms that a
+ * document contains. This value is multiplied into scores.
+ *
+ * The presence of a large portion of the query terms indicates a better
+ * match with the query, so implementations of this method usually return
+ * larger values when the ratio between these parameters is large and smaller
+ * values when the ratio between them is small.
+ *
+ * @param overlap the number of query terms matched in the document
+ * @param maxOverlap the total number of terms in the query
+ * @return a score factor based on term overlap with the query
+ */
+ public abstract float coord(int overlap, int maxOverlap);
}
diff --git a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
index 74bec4a5343..c3afa75b485 100644
--- a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
+++ b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
@@ -62,10 +62,10 @@ import org.apache.lucene.index.*;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
- SloppyPhraseScorer(TermPositions[] tps, int s, byte[] n, float w)
- throws IOException {
- super(tps, n, w);
- slop = s;
+ SloppyPhraseScorer(TermPositions[] tps, Similarity similarity,
+ int slop, byte[] norms, float weight) throws IOException {
+ super(tps, similarity, norms, weight);
+ this.slop = slop;
}
protected final float phraseFreq() throws IOException {
@@ -94,7 +94,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
int matchLength = end - start;
if (matchLength <= slop)
- freq += 1.0 / (matchLength + 1); // penalize longer matches
+ freq += getSimilarity().sloppyFreq(matchLength); // score match
if (pp.position > end)
end = pp.position;
diff --git a/src/java/org/apache/lucene/search/TermQuery.java b/src/java/org/apache/lucene/search/TermQuery.java
index 2fd54bc80c8..3e666a49764 100644
--- a/src/java/org/apache/lucene/search/TermQuery.java
+++ b/src/java/org/apache/lucene/search/TermQuery.java
@@ -73,7 +73,7 @@ public class TermQuery extends Query {
}
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
- idf = Similarity.idf(term, searcher);
+ idf = searcher.getSimilarity().idf(term, searcher);
weight = idf * boost;
return weight * weight; // square term weights
}
@@ -83,14 +83,15 @@ public class TermQuery extends Query {
weight *= idf; // factor from document
}
- Scorer scorer(IndexReader reader)
+ Scorer scorer(IndexReader reader, Similarity similarity)
throws IOException {
TermDocs termDocs = reader.termDocs(term);
if (termDocs == null)
return null;
- return new TermScorer(termDocs, reader.norms(term.field()), weight);
+ return new TermScorer(termDocs, similarity,
+ reader.norms(term.field()), weight);
}
/** Prints a user-readable version of this query. */
diff --git a/src/java/org/apache/lucene/search/TermScorer.java b/src/java/org/apache/lucene/search/TermScorer.java
index 76637131873..7582c66e7ec 100644
--- a/src/java/org/apache/lucene/search/TermScorer.java
+++ b/src/java/org/apache/lucene/search/TermScorer.java
@@ -63,21 +63,23 @@ final class TermScorer extends Scorer {
private float weight;
private int doc;
- private final int[] docs = new int[128]; // buffered doc numbers
- private final int[] freqs = new int[128]; // buffered term freqs
+ private final int[] docs = new int[32]; // buffered doc numbers
+ private final int[] freqs = new int[32]; // buffered term freqs
private int pointer;
private int pointerMax;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
- TermScorer(TermDocs td, byte[] n, float w) throws IOException {
- termDocs = td;
- norms = n;
- weight = w;
+ TermScorer(TermDocs td, Similarity similarity, byte[] norms, float weight)
+ throws IOException {
+ super(similarity);
+ this.termDocs = td;
+ this.norms = norms;
+ this.weight = weight;
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
- scoreCache[i] = Similarity.tf(i) * weight;
+ scoreCache[i] = getSimilarity().tf(i) * weight;
pointerMax = termDocs.read(docs, freqs); // fill buffers
@@ -91,12 +93,13 @@ final class TermScorer extends Scorer {
final void score(HitCollector c, final int end) throws IOException {
int d = doc; // cache doc in local
+ Similarity similarity = getSimilarity(); // cache sim in local
while (d < end) { // for docs in window
final int f = freqs[pointer];
float score = // compute tf(f)*weight
f < SCORE_CACHE_SIZE // check cache
? scoreCache[f] // cache hit
- : Similarity.tf(f)*weight; // cache miss
+ : similarity.tf(f)*weight; // cache miss
score *= Similarity.decodeNorm(norms[d]); // normalize for field
diff --git a/src/test/org/apache/lucene/index/DocTest.java b/src/test/org/apache/lucene/index/DocTest.java
index b5f9116f992..ffa5ae09cc2 100644
--- a/src/test/org/apache/lucene/index/DocTest.java
+++ b/src/test/org/apache/lucene/index/DocTest.java
@@ -59,6 +59,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.document.Document;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.demo.FileDocument;
import java.io.File;
@@ -95,7 +96,8 @@ class DocTest {
throws Exception {
Directory directory = FSDirectory.getDirectory("test", false);
Analyzer analyzer = new SimpleAnalyzer();
- DocumentWriter writer = new DocumentWriter(directory, analyzer, 1000);
+ DocumentWriter writer =
+ new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000);
File file = new File(fileName);
Document doc = FileDocument.Document(file);
diff --git a/src/test/org/apache/lucene/search/TestDocBoost.java b/src/test/org/apache/lucene/search/TestDocBoost.java
index c9e2c1a8d85..7ed8d3b41b2 100644
--- a/src/test/org/apache/lucene/search/TestDocBoost.java
+++ b/src/test/org/apache/lucene/search/TestDocBoost.java
@@ -76,7 +76,7 @@ public class TestDocBoost extends TestCase {
super(name);
}
- public static void test() throws Exception {
+ public void testDocBoost() throws Exception {
RAMDirectory store = new RAMDirectory();
IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
diff --git a/src/test/org/apache/lucene/search/TestSimilarity.java b/src/test/org/apache/lucene/search/TestSimilarity.java
new file mode 100644
index 00000000000..b095def7daa
--- /dev/null
+++ b/src/test/org/apache/lucene/search/TestSimilarity.java
@@ -0,0 +1,161 @@
+package org.apache.lucene.search;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * path
. Text will
be analyzed with a
. If create
is true, then a
@@ -186,7 +204,7 @@ public class IndexWriter {
/** Adds a document to this index.*/
public void addDocument(Document doc) throws IOException {
DocumentWriter dw =
- new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
+ new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
String segmentName = newSegmentName();
dw.addDocument(segmentName, doc);
synchronized (this) {
@@ -407,13 +425,4 @@ public class IndexWriter {
}
directory.renameFile("deleteable.new", "deletable");
}
-
- /**
- * Sets the Similarity
implementation to use.
- *
- * @param sim an instance of a class that implements Similarity
.
+ */
+
+import org.apache.lucene.document.Document;
+
+/** Expert: Default scoring implementation. */
+public class DefaultSimilarity extends Similarity {
+ /** Implemented as <code>1/sqrt(numTerms)</code>. */
+ public float lengthNorm(String fieldName, int numTerms) {
+ return (float)(1.0 / Math.sqrt(numTerms));
+ }
+
+ /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
+ public float queryNorm(float sumOfSquaredWeights) {
+ return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
+ }
+
+ /** Implemented as <code>sqrt(freq)</code>. */
+ public float tf(float freq) {
+ return (float)Math.sqrt(freq);
+ }
+
+ /** Implemented as <code>1 / (distance + 1)</code>. */
+ public float sloppyFreq(int distance) {
+ return 1.0f / (distance + 1);
+ }
+
+ /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+ public float idf(int docFreq, int numDocs) {
+ return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+ }
+
+ /** Implemented as <code>overlap / maxOverlap</code>. */
+ public float coord(int overlap, int maxOverlap) {
+ return overlap / (float)maxOverlap;
+ }
+}
diff --git a/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index 46c590fd68c..c33c5c59ba3 100644
--- a/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -61,9 +61,9 @@ import org.apache.lucene.index.*;
final class ExactPhraseScorer extends PhraseScorer {
- ExactPhraseScorer(TermPositions[] tps, byte[] n, float w)
- throws IOException {
- super(tps, n, w);
+ ExactPhraseScorer(TermPositions[] tps, Similarity similarity,
+ byte[] norms, float weight) throws IOException {
+ super(tps, similarity, norms, weight);
}
protected final float phraseFreq() throws IOException {
diff --git a/src/java/org/apache/lucene/search/MultiTermQuery.java b/src/java/org/apache/lucene/search/MultiTermQuery.java
index 668f87a0118..776acc97793 100644
--- a/src/java/org/apache/lucene/search/MultiTermQuery.java
+++ b/src/java/org/apache/lucene/search/MultiTermQuery.java
@@ -85,7 +85,6 @@ public class MultiTermQuery extends Query {
/** Constructs a query for terms matching term
. */
public MultiTermQuery(Term term) {
this.term = term;
- this.query = query;
}
/** Set the TermEnum to be used */
@@ -105,8 +104,9 @@ public class MultiTermQuery extends Query {
}
}
- final Scorer scorer(IndexReader reader) throws IOException {
- return getQuery().scorer(reader);
+ final Scorer scorer(IndexReader reader, Similarity similarity)
+ throws IOException {
+ return getQuery().scorer(reader, similarity);
}
private final BooleanQuery getQuery() throws IOException {
diff --git a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
index 7a9f94a7934..36a85539e48 100644
--- a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
+++ b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
@@ -147,7 +147,7 @@ public class PhrasePrefixQuery
_termArrays.add(terms);
}
- Scorer scorer(IndexReader reader)
+ Scorer scorer(IndexReader reader, Similarity similarity)
throws IOException
{
if (_termArrays.size() == 0) // optimize zero-term case
@@ -161,7 +161,7 @@ public class PhrasePrefixQuery
for (int i=0; iquery
. */
public final Hits search(Query query) throws IOException {
return search(query, (Filter)null);
@@ -91,12 +88,22 @@ public abstract class Searcher implements Searchable {
search(query, (Filter)null, results);
}
- /**
- * Sets the Similarity
implementation to use.
+ /** The Similarity implementation used by this searcher. */
+ private Similarity similarity = Similarity.getDefault();
+
+ /** Expert: Set the Similarity implementation used by this Searcher.
*
- * @param sim an instance of a class that implements Similarity
This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ public Similarity getSimilarity() {
+ return this.similarity;
}
}
diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java
index c525bc32e3e..459615f2943 100644
--- a/src/java/org/apache/lucene/search/Similarity.java
+++ b/src/java/org/apache/lucene/search/Similarity.java
@@ -55,14 +55,73 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
+import java.util.Vector;
import org.apache.lucene.index.Term;
+import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
-/** Internal class used for scoring.
- * q
for document d
is defined
+ * in terms of these methods as follows:
+ *
+ *
+ *
+ *
+ * @see #setDefault(Similarity)
+ * @see IndexWriter#setSimilarity(Similarity)
+ * @see Searcher#setSimilarity(Similarity)
+ */
public abstract class Similarity {
+ /** The Similarity implementation used by default. */
+ private static Similarity defaultImpl = new DefaultSimilarity();
+ /** Set the default Similarity implementation used by indexing and search
+ * code.
+ *
+ * @see Searcher#setSimilarity(Similarity)
+ * @see IndexWriter#setSimilarity(Similarity)
+ */
+ public static void setDefault(Similarity similarity) {
+ Similarity.defaultImpl = similarity;
+ }
+
+ /** Return the default Similarity implementation used by indexing and search
+ * code.
+ *
+ *
+ *
+ * score(q,d) =
+ *
+ * Σ
+ *
+ * {@link #tf(int) tf}(t in d) *
+ * {@link #idf(Term,Searcher) idf}(t) *
+ * {@link Field#getBoost getBoost}(t.field in d) *
+ * {@link #lengthNorm(String,int) lengthNorm}(t.field in d)
+ *
+ * *
+ * {@link #coord(int,int) coord}(q,d) *
+ * {@link #queryNorm(float) queryNorm}(q)
+ *
+ *
+ *
+ *
+ * t in q
+ *
+ * Similarity
implementation to use.
- *
- * @param sim an instance of a class that implements Similarity
The formula used is: 1.0f / Math.sqrt(numTerms)
- *
- * @see Field#setBoost(float)
- */
- public static float normalizeLength(int numTerms) {
- return (float)(1.0 / Math.sqrt(numTerms));
- }
-
/** Decodes a normalization factor stored in an index.
* @see #encodeNorm(float)
*/
@@ -102,6 +136,41 @@ public abstract class Similarity {
return NORM_TABLE[b & 0xFF];
}
+ /** Computes the normalization value for a field given the total number of
+ * terms contained in a field. These values, together with field boosts, are
+ * stored in an index and multiplied into scores for hits on each field by the
+ * search code.
+ *
+ * numTokens
is large,
+ * and larger values when numTokens
is small.
+ *
+ * freq
is large, and smaller values when freq
+ * is small.
+ *
+ * freq
is large, and smaller values when freq
+ * is small.
+ *
+ * @param freq the frequency of a term within a document
+ * @return a score factor based on a term's within-document frequency
+ */
+ public abstract float tf(float freq);
- static final float idf(Term term, Searcher searcher) throws IOException {
- // Use maxDoc() instead of numDocs() because its proportional to docFreq(),
- // i.e., when one is inaccurate, so is the other, and in the same way.
+ /** Computes a score factor for a simple term.
+ *
+ *
+ * return idf(searcher.docFreq(term), searcher.maxDoc());
+ *
+ *
+ * Note that {@link Searcher#maxDoc()} is used instead of {@link
+ * IndexReader#numDocs()} because it is proportional to {@link
+ * Searcher#docFreq(Term)}, i.e., when one is inaccurate, so is the other,
+ * and in the same direction.
+ *
+ * @param term the term in question
+ * @param searcher the document collection being searched
+ * @return a score factor for the term
+ */
+ public float idf(Term term, Searcher searcher) throws IOException {
return idf(searcher.docFreq(term), searcher.maxDoc());
}
- static final float idf(int docFreq, int numDocs) {
- return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+ /** Computes a score factor for a phrase.
+ *
+ *