From 6772e7567d96b0b670828a38866507012a8112e3 Mon Sep 17 00:00:00 2001 From: Doug Cutting Date: Thu, 7 Nov 2002 17:31:27 +0000 Subject: [PATCH] Added a public, extensible scoring API. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149885 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + build.xml | 32 ++- default.properties | 1 - .../apache/lucene/analysis/ru/package.html | 5 + .../org/apache/lucene/document/Field.java | 4 +- .../apache/lucene/index/DocumentWriter.java | 17 +- .../org/apache/lucene/index/IndexWriter.java | 31 ++- .../apache/lucene/search/BooleanQuery.java | 8 +- .../apache/lucene/search/BooleanScorer.java | 6 +- .../lucene/search/DefaultSimilarity.java | 90 +++++++ .../lucene/search/ExactPhraseScorer.java | 6 +- .../apache/lucene/search/MultiTermQuery.java | 6 +- .../lucene/search/PhrasePrefixQuery.java | 12 +- .../org/apache/lucene/search/PhraseQuery.java | 17 +- .../apache/lucene/search/PhraseScorer.java | 11 +- .../org/apache/lucene/search/PrefixQuery.java | 4 +- src/java/org/apache/lucene/search/Query.java | 17 +- .../org/apache/lucene/search/RangeQuery.java | 4 +- src/java/org/apache/lucene/search/Scorer.java | 10 + .../org/apache/lucene/search/Searcher.java | 23 +- .../org/apache/lucene/search/Similarity.java | 244 +++++++++++++++--- .../lucene/search/SloppyPhraseScorer.java | 10 +- .../org/apache/lucene/search/TermQuery.java | 7 +- .../org/apache/lucene/search/TermScorer.java | 19 +- src/test/org/apache/lucene/index/DocTest.java | 4 +- .../apache/lucene/search/TestDocBoost.java | 2 +- .../apache/lucene/search/TestSimilarity.java | 161 ++++++++++++ 27 files changed, 609 insertions(+), 145 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/ru/package.html create mode 100644 src/java/org/apache/lucene/search/DefaultSimilarity.java create mode 100644 src/test/org/apache/lucene/search/TestSimilarity.java diff --git a/CHANGES.txt b/CHANGES.txt index 3c80f8c3f2a..3252cee8085 100644 --- a/CHANGES.txt +++ 
b/CHANGES.txt @@ -93,6 +93,9 @@ $Id$ 17. Added Russian Analyzer. (Boris Okner via otis) + 18. Added a public, extensible scoring API. For details, see the + javadoc for org.apache.lucene.search.Similarity. + 1.2 RC6 diff --git a/build.xml b/build.xml index 03a15dc41a6..096d5e4cb9a 100644 --- a/build.xml +++ b/build.xml @@ -12,14 +12,21 @@ - - - + + + + + + + + + + @@ -245,7 +252,7 @@ Implementation-Vendor: Lucene includes="**/*.java" destdir="${build.demo.classes}" debug="${debug}"> - + @@ -255,23 +262,14 @@ Implementation-Vendor: Lucene - - - - - - - - - - + @@ -295,7 +293,7 @@ Implementation-Vendor: Lucene includes="**/*.java" destdir="${junit.classes}" debug="${debug}"> - + @@ -565,7 +563,7 @@ Implementation-Vendor: Lucene - + diff --git a/default.properties b/default.properties index 85e633fceee..7b08e2547a2 100644 --- a/default.properties +++ b/default.properties @@ -50,7 +50,6 @@ build.docweb = ${build.dir}/docweb build.docweb.war.name = lucenedocweb build.test = ${build.dir}/test -build.test.src = ${build.test}/src build.test.classes = ${build.test}/classes junit.src = ${basedir}/src/test diff --git a/src/java/org/apache/lucene/analysis/ru/package.html b/src/java/org/apache/lucene/analysis/ru/package.html new file mode 100644 index 00000000000..c63920a9bfa --- /dev/null +++ b/src/java/org/apache/lucene/analysis/ru/package.html @@ -0,0 +1,5 @@ + + +Support for indexing and searching Russian text. + + diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java index c9f65bdb7a2..3d478580460 100644 --- a/src/java/org/apache/lucene/document/Field.java +++ b/src/java/org/apache/lucene/document/Field.java @@ -85,13 +85,13 @@ public final class Field implements java.io.Serializable { *

The boost is multiplied by {@link Document#getBoost()} of the document * containing this field. If a document has multiple fields with the same * name, all such values are multiplied together. This product is then - * multipled by the value {@link Similarity#normalizeLength(int)}, and + * multipled by the value {@link Similarity#lengthNorm(String,int)}, and * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * * @see Document#setBoost(float) - * @see Similarity#normalizeLength(int) + * @see Similarity#lengthNorm(String, int) * @see Similarity#encodeNorm(float) */ public void setBoost(float boost) { diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java index 79167e90c9d..d0b695dfe35 100644 --- a/src/java/org/apache/lucene/index/DocumentWriter.java +++ b/src/java/org/apache/lucene/index/DocumentWriter.java @@ -73,13 +73,16 @@ import org.apache.lucene.search.Similarity; final class DocumentWriter { private Analyzer analyzer; private Directory directory; + private Similarity similarity; private FieldInfos fieldInfos; private int maxFieldLength; - - DocumentWriter(Directory d, Analyzer a, int mfl) { - directory = d; - analyzer = a; - maxFieldLength = mfl; + + DocumentWriter(Directory directory, Analyzer analyzer, + Similarity similarity, int maxFieldLength) { + this.directory = directory; + this.analyzer = analyzer; + this.similarity = similarity; + this.maxFieldLength = maxFieldLength; } final void addDocument(String segment, Document doc) @@ -320,10 +323,10 @@ final class DocumentWriter { if (field.isIndexed()) { int n = fieldInfos.fieldNumber(field.name()); float norm = - fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]); + fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]); OutputStream norms = directory.createFile(segment + ".f" + n); try { - 
norms.writeByte(Similarity.encodeNorm(norm)); + norms.writeByte(similarity.encodeNorm(norm)); } finally { norms.close(); } diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java index 0238a42211c..846776db5b3 100644 --- a/src/java/org/apache/lucene/index/IndexWriter.java +++ b/src/java/org/apache/lucene/index/IndexWriter.java @@ -68,6 +68,8 @@ import org.apache.lucene.store.OutputStream; import org.apache.lucene.search.Similarity; import org.apache.lucene.document.Document; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Similarity; + /** An IndexWriter creates and maintains an index. @@ -89,12 +91,28 @@ public class IndexWriter { private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text + private Similarity similarity = Similarity.getDefault(); // how to normalize + private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private final Directory ramDirectory = new RAMDirectory(); // for temp segs private Lock writeLock; - private Similarity similarity; + /** Expert: Set the Similarity implementation used by this IndexWriter. + * + * @see Similarity#setDefault(Similarity) + */ + public void setSimilarity(Similarity similarity) { + this.similarity = similarity; + } + + /** Expert: Return the Similarity implementation used by this IndexWriter. + * + *

This defaults to the current value of {@link Similarity#getDefault()}. + */ + public Similarity getSimilarity() { + return this.similarity; + } /** Constructs an IndexWriter for the index in path. Text will be analyzed with a. If create is true, then a @@ -186,7 +204,7 @@ public class IndexWriter { /** Adds a document to this index.*/ public void addDocument(Document doc) throws IOException { DocumentWriter dw = - new DocumentWriter(ramDirectory, analyzer, maxFieldLength); + new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength); String segmentName = newSegmentName(); dw.addDocument(segmentName, doc); synchronized (this) { @@ -407,13 +425,4 @@ public class IndexWriter { } directory.renameFile("deleteable.new", "deletable"); } - - /** - * Sets the Similarity implementation to use. - * - * @param sim an instance of a class that implements Similarity. + */ + +import org.apache.lucene.document.Document; + +/** Expert: Default scoring implementation. */ +public class DefaultSimilarity extends Similarity { + /** Implemented as 1/sqrt(numTerms). */ + public float lengthNorm(String fieldName, int numTerms) { + return (float)(1.0 / Math.sqrt(numTerms)); + } + + /** Implemented as 1/sqrt(sumOfSquaredWeights). */ + public float queryNorm(float sumOfSquaredWeights) { + return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); + } + + /** Implemented as sqrt(freq). */ + public float tf(float freq) { + return (float)Math.sqrt(freq); + } + + /** Implemented as 1 / (distance + 1). */ + public float sloppyFreq(int distance) { + return 1.0f / (distance + 1); + } + + /** Implemented as log(numDocs/(docFreq+1)) + 1. */ + public float idf(int docFreq, int numDocs) { + return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); + } + + /** Implemented as overlap / maxOverlap. 
*/ + public float coord(int overlap, int maxOverlap) { + return overlap / (float)maxOverlap; + } +} diff --git a/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/src/java/org/apache/lucene/search/ExactPhraseScorer.java index 46c590fd68c..c33c5c59ba3 100644 --- a/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -61,9 +61,9 @@ import org.apache.lucene.index.*; final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(TermPositions[] tps, byte[] n, float w) - throws IOException { - super(tps, n, w); + ExactPhraseScorer(TermPositions[] tps, Similarity similarity, + byte[] norms, float weight) throws IOException { + super(tps, similarity, norms, weight); } protected final float phraseFreq() throws IOException { diff --git a/src/java/org/apache/lucene/search/MultiTermQuery.java b/src/java/org/apache/lucene/search/MultiTermQuery.java index 668f87a0118..776acc97793 100644 --- a/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -85,7 +85,6 @@ public class MultiTermQuery extends Query { /** Constructs a query for terms matching term. 
*/ public MultiTermQuery(Term term) { this.term = term; - this.query = query; } /** Set the TermEnum to be used */ @@ -105,8 +104,9 @@ public class MultiTermQuery extends Query { } } - final Scorer scorer(IndexReader reader) throws IOException { - return getQuery().scorer(reader); + final Scorer scorer(IndexReader reader, Similarity similarity) + throws IOException { + return getQuery().scorer(reader, similarity); } private final BooleanQuery getQuery() throws IOException { diff --git a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java index 7a9f94a7934..36a85539e48 100644 --- a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java +++ b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java @@ -147,7 +147,7 @@ public class PhrasePrefixQuery _termArrays.add(terms); } - Scorer scorer(IndexReader reader) + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { if (_termArrays.size() == 0) // optimize zero-term case @@ -161,7 +161,7 @@ public class PhrasePrefixQuery for (int i=0; i 0.0) { - float score = Similarity.tf(freq)*weight; // compute score + float score = similarity.tf(freq)*weight; // compute score score *= Similarity.decodeNorm(norms[first.doc]); // normalize results.collect(first.doc, score); // add to results } diff --git a/src/java/org/apache/lucene/search/PrefixQuery.java b/src/java/org/apache/lucene/search/PrefixQuery.java index 4525f26347b..cc5f63d71b8 100644 --- a/src/java/org/apache/lucene/search/PrefixQuery.java +++ b/src/java/org/apache/lucene/search/PrefixQuery.java @@ -90,8 +90,8 @@ public class PrefixQuery extends Query { } } - Scorer scorer(IndexReader reader) throws IOException { - return getQuery().scorer(reader); + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { + return getQuery().scorer(reader, similarity); } private BooleanQuery getQuery() throws IOException { diff --git 
a/src/java/org/apache/lucene/search/Query.java b/src/java/org/apache/lucene/search/Query.java index 88b50200c05..b3b84727e5c 100644 --- a/src/java/org/apache/lucene/search/Query.java +++ b/src/java/org/apache/lucene/search/Query.java @@ -86,18 +86,19 @@ public abstract class Query implements java.io.Serializable abstract void normalize(float norm); // query evaluation - abstract Scorer scorer(IndexReader reader) throws IOException; + abstract Scorer scorer(IndexReader reader, Similarity similarity) + throws IOException; void prepare(IndexReader reader) {} static Scorer scorer(Query query, Searcher searcher, IndexReader reader) - throws IOException - { - query.prepare(reader); - float sum = query.sumOfSquaredWeights(searcher); - float norm = 1.0f / (float)Math.sqrt(sum); - query.normalize(norm); - return query.scorer(reader); + throws IOException { + Similarity similarity = searcher.getSimilarity(); + query.prepare(reader); + float sum = query.sumOfSquaredWeights(searcher); + float norm = similarity.queryNorm(sum); + query.normalize(norm); + return query.scorer(reader, similarity); } /** diff --git a/src/java/org/apache/lucene/search/RangeQuery.java b/src/java/org/apache/lucene/search/RangeQuery.java index dbeec4a4568..277e174d289 100644 --- a/src/java/org/apache/lucene/search/RangeQuery.java +++ b/src/java/org/apache/lucene/search/RangeQuery.java @@ -113,9 +113,9 @@ public class RangeQuery extends Query } } - Scorer scorer(IndexReader reader) throws IOException + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { - return getQuery().scorer(reader); + return getQuery().scorer(reader, similarity); } private BooleanQuery getQuery() throws IOException diff --git a/src/java/org/apache/lucene/search/Scorer.java b/src/java/org/apache/lucene/search/Scorer.java index 863a447c7d9..bd04dfb2580 100644 --- a/src/java/org/apache/lucene/search/Scorer.java +++ b/src/java/org/apache/lucene/search/Scorer.java @@ -57,5 +57,15 @@ package 
org.apache.lucene.search; import java.io.IOException; abstract class Scorer { + private Similarity similarity; + + protected Scorer(Similarity similarity) { + this.similarity = similarity; + } + + public Similarity getSimilarity() { + return this.similarity; + } + abstract void score(HitCollector hc, int maxDoc) throws IOException; } diff --git a/src/java/org/apache/lucene/search/Searcher.java b/src/java/org/apache/lucene/search/Searcher.java index d9d1b18f41d..5cf9a3e33ed 100644 --- a/src/java/org/apache/lucene/search/Searcher.java +++ b/src/java/org/apache/lucene/search/Searcher.java @@ -63,9 +63,6 @@ import org.apache.lucene.index.IndexReader; * Implements some common utility methods. */ public abstract class Searcher implements Searchable { - - protected Similarity similarity; - /** Returns the documents matching query. */ public final Hits search(Query query) throws IOException { return search(query, (Filter)null); @@ -91,12 +88,22 @@ public abstract class Searcher implements Searchable { search(query, (Filter)null, results); } - /** - * Sets the Similarity implementation to use. + /** The Similarity implementation used by this searcher. */ + private Similarity similarity = Similarity.getDefault(); + + /** Expert: Set the Similarity implementation used by this Searcher. * - * @param sim an instance of a class that implements SimilarityThis defaults to the current value of {@link Similarity#getDefault()}. 
+ */ + public Similarity getSimilarity() { + return this.similarity; } } diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java index c525bc32e3e..459615f2943 100644 --- a/src/java/org/apache/lucene/search/Similarity.java +++ b/src/java/org/apache/lucene/search/Similarity.java @@ -55,14 +55,73 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.Vector; import org.apache.lucene.index.Term; +import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; -/** Internal class used for scoring. - *

Public only so that the indexing code can compute and store the - * normalization byte for each document. */ +/** Expert: Scoring API. + *

Subclasses implement search scoring. + * + *

The score of query q for document d is defined + * in terms of these methods as follows: + * + * + * + * + * + * + * + * + * + * + * + *
score(q,d) =
+ * Σ + * {@link #tf(int) tf}(t in d) * + * {@link #idf(Term,Searcher) idf}(t) * + * {@link Field#getBoost getBoost}(t.field in d) * + * {@link #lengthNorm(String,int) lengthNorm}(t.field in d) + *  * + * {@link #coord(int,int) coord}(q,d) * + * {@link #queryNorm(float) queryNorm}(q) + *
+ * t in q + *
+ * + * @see #setDefault(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + * @see Searcher#setSimilarity(Similarity) + */ public abstract class Similarity { + /** The Similarity implementation used by default. */ + private static Similarity defaultImpl = new DefaultSimilarity(); + /** Set the default Similarity implementation used by indexing and search + * code. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + public static void setDefault(Similarity similarity) { + Similarity.defaultImpl = similarity; + } + + /** Return the default Similarity implementation used by indexing and search + * code. + * + *

This is initially an instance of {@link DefaultSimilarity}. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + public static Similarity getDefault() { + return Similarity.defaultImpl; + } + + /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; static { @@ -70,31 +129,6 @@ public abstract class Similarity { NORM_TABLE[i] = byteToFloat((byte)i); } - private static Similarity similarity; - - private Similarity() {} // no public constructor - - /** - * Sets the Similarity implementation to use. - * - * @param sim an instance of a class that implements SimilarityThe formula used is: 1.0f / Math.sqrt(numTerms) - * - * @see Field#setBoost(float) - */ - public static float normalizeLength(int numTerms) { - return (float)(1.0 / Math.sqrt(numTerms)); - } - /** Decodes a normalization factor stored in an index. * @see #encodeNorm(float) */ @@ -102,6 +136,41 @@ public abstract class Similarity { return NORM_TABLE[b & 0xFF]; } + /** Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multipled into scores for hits on each field by the + * search code. + * + *

Matches in longer fields are less precise, so implementations of this + * method usually return smaller values when numTokens is large, + * and larger values when numTokens is small. + * + *

Note that these values are computed under {@link + * IndexWriter#addDocument(Document)} and then stored using + * {@link #encodeNorm(float)}. Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * @param fieldName the name of the field + * @param numTokens the total number of tokens contained in fields named + * fieldName of doc. + * @return a normalization factor for hits on this field of this document + * + * @see Field#setBoost(float) + */ + public abstract float lengthNorm(String fieldName, int numTokens); + + /** Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multiplied into the + * weight of each query term. + * + *

This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * @param sumOfSquaredWeights the sum of the squares of query term weights + * @return a normalization factor for query weights + */ + public abstract float queryNorm(float sumOfSquaredWeights); + /** Encodes a normalization factor for storage in an index. * *

The encoding uses a five-bit exponent and three-bit mantissa, thus @@ -151,25 +220,118 @@ public abstract class Similarity { return (byte)((exponent << 3) | mantissa); // pack into a byte } - static final float tf(int freq) { - return (float)Math.sqrt(freq); + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + *

The default implementation calls {@link #tf(float)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public float tf(int freq) { + return tf((float)freq); } - static final float tf(float freq) { - return (float)Math.sqrt(freq); - } + /** Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to {@link #tf(float)}. + * + *

A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * @see PhraseQuery#setSlop(int) + * @param distance the edit distance of this sloppy phrase match + * @return the frequency increment for this match + */ + public abstract float sloppyFreq(int distance); + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public abstract float tf(float freq); - static final float idf(Term term, Searcher searcher) throws IOException { - // Use maxDoc() instead of numDocs() because its proportional to docFreq(), - // i.e., when one is inaccurate, so is the other, and in the same way. + /** Computes a score factor for a simple term. + * + *

The default implementation is:

+   *   return idf(searcher.docFreq(term), searcher.maxDoc());
+   * 
+ * + * Note that {@link Searcher#maxDoc()} is used instead of {@link + * IndexReader#numDocs()} because it is proportional to {@link + * Searcher#docFreq(Term)} , i.e., when one is inaccurate, so is the other, + * and in the same direction. + * + * @param term the term in question + * @param searcher the document collection being searched + * @return a score factor for the term + */ + public float idf(Term term, Searcher searcher) throws IOException { return idf(searcher.docFreq(term), searcher.maxDoc()); } - static final float idf(int docFreq, int numDocs) { - return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); + /** Computes a score factor for a phrase. + * + *

The default implementation sums the {@link #idf(Term,Searcher)} factor + * for each term in the phrase. + * + * @param terms the vector of terms in the phrase + * @param searcher the document collection being searched + * @return a score factor for the phrase + */ + public float idf(Vector terms, Searcher searcher) throws IOException { + float idf = 0.0f; + for (int i = 0; i < terms.size(); i++) { + idf += idf((Term)terms.elementAt(i), searcher); + } + return idf; } + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + *

Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + protected abstract float idf(int docFreq, int numDocs); - static final float coord(int overlap, int maxOverlap) { - return overlap / (float)maxOverlap; - } + /** Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + *

The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * @param overlap the number of query terms matched in the document + * @param maxOverlap the total number of terms in the query + * @return a score factor based on term overlap with the query + */ + public abstract float coord(int overlap, int maxOverlap); } diff --git a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 74bec4a5343..c3afa75b485 100644 --- a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -62,10 +62,10 @@ import org.apache.lucene.index.*; final class SloppyPhraseScorer extends PhraseScorer { private int slop; - SloppyPhraseScorer(TermPositions[] tps, int s, byte[] n, float w) - throws IOException { - super(tps, n, w); - slop = s; + SloppyPhraseScorer(TermPositions[] tps, Similarity similarity, + int slop, byte[] norms, float weight) throws IOException { + super(tps, similarity, norms, weight); + this.slop = slop; } protected final float phraseFreq() throws IOException { @@ -94,7 +94,7 @@ final class SloppyPhraseScorer extends PhraseScorer { int matchLength = end - start; if (matchLength <= slop) - freq += 1.0 / (matchLength + 1); // penalize longer matches + freq += getSimilarity().sloppyFreq(matchLength); // score match if (pp.position > end) end = pp.position; diff --git a/src/java/org/apache/lucene/search/TermQuery.java b/src/java/org/apache/lucene/search/TermQuery.java index 2fd54bc80c8..3e666a49764 100644 --- a/src/java/org/apache/lucene/search/TermQuery.java +++ b/src/java/org/apache/lucene/search/TermQuery.java @@ -73,7 +73,7 @@ public class TermQuery extends Query { } final float sumOfSquaredWeights(Searcher searcher) 
throws IOException { - idf = Similarity.idf(term, searcher); + idf = searcher.getSimilarity().idf(term, searcher); weight = idf * boost; return weight * weight; // square term weights } @@ -83,14 +83,15 @@ public class TermQuery extends Query { weight *= idf; // factor from document } - Scorer scorer(IndexReader reader) + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { TermDocs termDocs = reader.termDocs(term); if (termDocs == null) return null; - return new TermScorer(termDocs, reader.norms(term.field()), weight); + return new TermScorer(termDocs, similarity, + reader.norms(term.field()), weight); } /** Prints a user-readable version of this query. */ diff --git a/src/java/org/apache/lucene/search/TermScorer.java b/src/java/org/apache/lucene/search/TermScorer.java index 76637131873..7582c66e7ec 100644 --- a/src/java/org/apache/lucene/search/TermScorer.java +++ b/src/java/org/apache/lucene/search/TermScorer.java @@ -63,21 +63,23 @@ final class TermScorer extends Scorer { private float weight; private int doc; - private final int[] docs = new int[128]; // buffered doc numbers - private final int[] freqs = new int[128]; // buffered term freqs + private final int[] docs = new int[32]; // buffered doc numbers + private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; - TermScorer(TermDocs td, byte[] n, float w) throws IOException { - termDocs = td; - norms = n; - weight = w; + TermScorer(TermDocs td, Similarity similarity, byte[] norms, float weight) + throws IOException { + super(similarity); + this.termDocs = td; + this.norms = norms; + this.weight = weight; for (int i = 0; i < SCORE_CACHE_SIZE; i++) - scoreCache[i] = Similarity.tf(i) * weight; + scoreCache[i] = getSimilarity().tf(i) * weight; pointerMax = termDocs.read(docs, freqs); // fill buffers @@ -91,12 +93,13 @@ final class 
TermScorer extends Scorer { final void score(HitCollector c, final int end) throws IOException { int d = doc; // cache doc in local + Similarity similarity = getSimilarity(); // cache sim in local while (d < end) { // for docs in window final int f = freqs[pointer]; float score = // compute tf(f)*weight f < SCORE_CACHE_SIZE // check cache ? scoreCache[f] // cache hit - : Similarity.tf(f)*weight; // cache miss + : similarity.tf(f)*weight; // cache miss score *= Similarity.decodeNorm(norms[d]); // normalize for field diff --git a/src/test/org/apache/lucene/index/DocTest.java b/src/test/org/apache/lucene/index/DocTest.java index b5f9116f992..ffa5ae09cc2 100644 --- a/src/test/org/apache/lucene/index/DocTest.java +++ b/src/test/org/apache/lucene/index/DocTest.java @@ -59,6 +59,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.document.Document; +import org.apache.lucene.search.Similarity; import org.apache.lucene.demo.FileDocument; import java.io.File; @@ -95,7 +96,8 @@ class DocTest { throws Exception { Directory directory = FSDirectory.getDirectory("test", false); Analyzer analyzer = new SimpleAnalyzer(); - DocumentWriter writer = new DocumentWriter(directory, analyzer, 1000); + DocumentWriter writer = + new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000); File file = new File(fileName); Document doc = FileDocument.Document(file); diff --git a/src/test/org/apache/lucene/search/TestDocBoost.java b/src/test/org/apache/lucene/search/TestDocBoost.java index c9e2c1a8d85..7ed8d3b41b2 100644 --- a/src/test/org/apache/lucene/search/TestDocBoost.java +++ b/src/test/org/apache/lucene/search/TestDocBoost.java @@ -76,7 +76,7 @@ public class TestDocBoost extends TestCase { super(name); } - public static void test() throws Exception { + public void testDocBoost() throws Exception { RAMDirectory store = new RAMDirectory(); IndexWriter writer = new 
IndexWriter(store, new SimpleAnalyzer(), true); diff --git a/src/test/org/apache/lucene/search/TestSimilarity.java b/src/test/org/apache/lucene/search/TestSimilarity.java new file mode 100644 index 00000000000..b095def7daa --- /dev/null +++ b/src/test/org/apache/lucene/search/TestSimilarity.java @@ -0,0 +1,161 @@ +package org.apache.lucene.search; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import junit.framework.TestCase; + +import java.util.Vector; + + /** Similarity unit test. 
+  * Indexes two tiny documents with a constant-factor {@link Similarity} and
+  * checks the exact scores reported for term, boolean and phrase queries.
+  *
+  * @author Doug Cutting
+  * @version $Revision$
+  */
+public class TestSimilarity extends TestCase {
+  public TestSimilarity(String name) {
+    super(name);
+  }
+
+  /** Scoring with constant factors: only tf varies (it is the raw frequency)
+   * and sloppyFreq is always 2, so expected scores can be stated exactly. */
+  public static class SimpleSimilarity extends Similarity {
+    public float lengthNorm(String field, int numTerms) { return 1.0f; }
+    public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
+    public float tf(float freq) { return freq; }
+    public float sloppyFreq(int distance) { return 2.0f; }
+    public float idf(Vector terms, Searcher searcher) { return 1.0f; }
+    public float idf(int docFreq, int numDocs) { return 1.0f; }
+    public float coord(int overlap, int maxOverlap) { return 1.0f; }
+  }
+
+  /** Builds an index of "a c" and "a b c" in a RAMDirectory with
+   * SimpleSimilarity, then asserts the score of every hit for four queries.
+   * @throws Exception if indexing or searching fails */
+  public void testSimilarity() throws Exception {
+    RAMDirectory store = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
+    writer.setSimilarity(new SimpleSimilarity());
+
+    Document d1 = new Document();
+    d1.add(Field.Text("field", "a c"));
+
+    Document d2 = new Document();
+    d2.add(Field.Text("field", "a b c"));
+
+    writer.addDocument(d1);
+    writer.addDocument(d2);
+    writer.optimize();
+    writer.close();
+
+    Searcher searcher = new IndexSearcher(store);
+    try {
+      searcher.setSimilarity(new SimpleSimilarity());
+
+      Term a = new Term("field", "a");
+      Term b = new Term("field", "b");
+      Term c = new Term("field", "c");
+
+      // Single term b: every hit must score exactly 1.
+      searcher.search(new TermQuery(b), new HitCollector() {
+        public final void collect(int doc, float score) {
+          assertEquals("doc " + doc, 1.0f, score, 0.0f);
+        }
+      });
+
+      // Boolean query over a and b: the expected score is the document number
+      // plus one (presumably doc 0 matches only a, doc 1 matches a and b).
+      BooleanQuery bq = new BooleanQuery();
+      bq.add(new TermQuery(a), false, false);
+      bq.add(new TermQuery(b), false, false);
+      searcher.search(bq, new HitCollector() {
+        public final void collect(int doc, float score) {
+          assertEquals("doc " + doc, doc + 1.0f, score, 0.0f);
+        }
+      });
+
+      // Phrase query for a then c, slop left unset: every hit must score 1.
+      PhraseQuery pq = new PhraseQuery();
+      pq.add(a);
+      pq.add(c);
+      searcher.search(pq, new HitCollector() {
+        public final void collect(int doc, float score) {
+          assertEquals("doc " + doc, 1.0f, score, 0.0f);
+        }
+      });
+
+      // Same phrase with slop 2: every hit must score 2.
+      pq.setSlop(2);
+      searcher.search(pq, new HitCollector() {
+        public final void collect(int doc, float score) {
+          assertEquals("doc " + doc, 2.0f, score, 0.0f);
+        }
+      });
+    } finally {
+      searcher.close();   // close the searcher even when an assertion fails
+    }
+  }
+}