mirror of https://github.com/apache/lucene.git
Added a public, extensible scoring API.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149885 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
98330b5030
commit
6772e7567d
|
@ -93,6 +93,9 @@ $Id$
|
||||||
17. Added Russian Analyzer.
|
17. Added Russian Analyzer.
|
||||||
(Boris Okner via otis)
|
(Boris Okner via otis)
|
||||||
|
|
||||||
|
18. Added a public, extensible scoring API. For details, see the
|
||||||
|
javadoc for org.apache.lucene.search.Similarity.
|
||||||
|
|
||||||
|
|
||||||
1.2 RC6
|
1.2 RC6
|
||||||
|
|
||||||
|
|
32
build.xml
32
build.xml
|
@ -12,14 +12,21 @@
|
||||||
<!-- Build classpath -->
|
<!-- Build classpath -->
|
||||||
<path id="classpath">
|
<path id="classpath">
|
||||||
<pathelement location="${build.classes}"/>
|
<pathelement location="${build.classes}"/>
|
||||||
<pathelement location="${build.demo.classes}"/>
|
|
||||||
<pathelement location="${build.test.classes}"/>
|
|
||||||
<pathelement location="."/>
|
|
||||||
<fileset dir="lib">
|
<fileset dir="lib">
|
||||||
<include name="*.jar" />
|
<include name="*.jar" />
|
||||||
</fileset>
|
</fileset>
|
||||||
</path>
|
</path>
|
||||||
|
|
||||||
|
<path id="demo.classpath">
|
||||||
|
<path refid="classpath"/>
|
||||||
|
<pathelement location="${build.demo.classes}"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<path refid="demo.classpath"/>
|
||||||
|
<pathelement location="${build.test.classes}"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
<path id="junit.classpath">
|
<path id="junit.classpath">
|
||||||
<pathelement location="${junit.classes}" />
|
<pathelement location="${junit.classes}" />
|
||||||
<pathelement location="${build.classes}"/>
|
<pathelement location="${build.classes}"/>
|
||||||
|
@ -245,7 +252,7 @@ Implementation-Vendor: Lucene
|
||||||
includes="**/*.java"
|
includes="**/*.java"
|
||||||
destdir="${build.demo.classes}"
|
destdir="${build.demo.classes}"
|
||||||
debug="${debug}">
|
debug="${debug}">
|
||||||
<classpath refid="classpath"/>
|
<classpath refid="demo.classpath"/>
|
||||||
</javac>
|
</javac>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
@ -255,23 +262,14 @@ Implementation-Vendor: Lucene
|
||||||
<!-- -->
|
<!-- -->
|
||||||
<!-- ================================================================== -->
|
<!-- ================================================================== -->
|
||||||
<target name="test" depends="compile,demo">
|
<target name="test" depends="compile,demo">
|
||||||
<mkdir dir="${build.test}"/>
|
|
||||||
|
|
||||||
<copy todir="${build.test.src}">
|
|
||||||
<fileset dir="${test.src}">
|
|
||||||
<include name="**/*.java"/>
|
|
||||||
</fileset>
|
|
||||||
</copy>
|
|
||||||
|
|
||||||
<mkdir dir="${build.test.classes}"/>
|
<mkdir dir="${build.test.classes}"/>
|
||||||
|
|
||||||
<javac
|
<javac
|
||||||
encoding="${build.encoding}"
|
encoding="${build.encoding}"
|
||||||
srcdir="${build.test.src}"
|
srcdir="${test.src}"
|
||||||
includes="**/*.java"
|
includes="**/*.java"
|
||||||
destdir="${build.test.classes}"
|
destdir="${build.test.classes}"
|
||||||
debug="${debug}">
|
debug="${debug}">
|
||||||
<classpath refid="classpath"/>
|
<classpath refid="test.classpath"/>
|
||||||
</javac>
|
</javac>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
@ -295,7 +293,7 @@ Implementation-Vendor: Lucene
|
||||||
includes="**/*.java"
|
includes="**/*.java"
|
||||||
destdir="${junit.classes}"
|
destdir="${junit.classes}"
|
||||||
debug="${debug}">
|
debug="${debug}">
|
||||||
<classpath refid="classpath"/>
|
<classpath refid="test.classpath"/>
|
||||||
</javac>
|
</javac>
|
||||||
|
|
||||||
<junit printsummary="yes" haltonfailure="no" >
|
<junit printsummary="yes" haltonfailure="no" >
|
||||||
|
@ -565,7 +563,7 @@ Implementation-Vendor: Lucene
|
||||||
<!-- ================================================================== -->
|
<!-- ================================================================== -->
|
||||||
<!-- -->
|
<!-- -->
|
||||||
<!-- ================================================================== -->
|
<!-- ================================================================== -->
|
||||||
<target name="clean" depends="init">
|
<target name="clean">
|
||||||
<delete dir="${build.dir}"/>
|
<delete dir="${build.dir}"/>
|
||||||
<delete dir="${dist.dir}"/>
|
<delete dir="${dist.dir}"/>
|
||||||
<delete file="${basedir}/${final.name}.tar"/>
|
<delete file="${basedir}/${final.name}.tar"/>
|
||||||
|
|
|
@ -50,7 +50,6 @@ build.docweb = ${build.dir}/docweb
|
||||||
build.docweb.war.name = lucenedocweb
|
build.docweb.war.name = lucenedocweb
|
||||||
|
|
||||||
build.test = ${build.dir}/test
|
build.test = ${build.dir}/test
|
||||||
build.test.src = ${build.test}/src
|
|
||||||
build.test.classes = ${build.test}/classes
|
build.test.classes = ${build.test}/classes
|
||||||
|
|
||||||
junit.src = ${basedir}/src/test
|
junit.src = ${basedir}/src/test
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
Support for indexing and searching Russian text.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -85,13 +85,13 @@ public final class Field implements java.io.Serializable {
|
||||||
* <p>The boost is multiplied by {@link Document#getBoost()} of the document
|
* <p>The boost is multiplied by {@link Document#getBoost()} of the document
|
||||||
* containing this field. If a document has multiple fields with the same
|
* containing this field. If a document has multiple fields with the same
|
||||||
* name, all such values are multiplied together. This product is then
|
* name, all such values are multiplied together. This product is then
|
||||||
* multiplied by the value {@link Similarity#normalizeLength(int)}, and
|
* multiplied by the value {@link Similarity#lengthNorm(String,int)}, and
|
||||||
* rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
|
* rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
|
||||||
* index. One should attempt to ensure that this product does not overflow
|
* index. One should attempt to ensure that this product does not overflow
|
||||||
* the range of that encoding.
|
* the range of that encoding.
|
||||||
*
|
*
|
||||||
* @see Document#setBoost(float)
|
* @see Document#setBoost(float)
|
||||||
* @see Similarity#normalizeLength(int)
|
* @see Similarity#lengthNorm(String, int)
|
||||||
* @see Similarity#encodeNorm(float)
|
* @see Similarity#encodeNorm(float)
|
||||||
*/
|
*/
|
||||||
public void setBoost(float boost) {
|
public void setBoost(float boost) {
|
||||||
|
|
|
@ -73,13 +73,16 @@ import org.apache.lucene.search.Similarity;
|
||||||
final class DocumentWriter {
|
final class DocumentWriter {
|
||||||
private Analyzer analyzer;
|
private Analyzer analyzer;
|
||||||
private Directory directory;
|
private Directory directory;
|
||||||
|
private Similarity similarity;
|
||||||
private FieldInfos fieldInfos;
|
private FieldInfos fieldInfos;
|
||||||
private int maxFieldLength;
|
private int maxFieldLength;
|
||||||
|
|
||||||
DocumentWriter(Directory d, Analyzer a, int mfl) {
|
DocumentWriter(Directory directory, Analyzer analyzer,
|
||||||
directory = d;
|
Similarity similarity, int maxFieldLength) {
|
||||||
analyzer = a;
|
this.directory = directory;
|
||||||
maxFieldLength = mfl;
|
this.analyzer = analyzer;
|
||||||
|
this.similarity = similarity;
|
||||||
|
this.maxFieldLength = maxFieldLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
final void addDocument(String segment, Document doc)
|
final void addDocument(String segment, Document doc)
|
||||||
|
@ -320,10 +323,10 @@ final class DocumentWriter {
|
||||||
if (field.isIndexed()) {
|
if (field.isIndexed()) {
|
||||||
int n = fieldInfos.fieldNumber(field.name());
|
int n = fieldInfos.fieldNumber(field.name());
|
||||||
float norm =
|
float norm =
|
||||||
fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]);
|
fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
|
||||||
OutputStream norms = directory.createFile(segment + ".f" + n);
|
OutputStream norms = directory.createFile(segment + ".f" + n);
|
||||||
try {
|
try {
|
||||||
norms.writeByte(Similarity.encodeNorm(norm));
|
norms.writeByte(similarity.encodeNorm(norm));
|
||||||
} finally {
|
} finally {
|
||||||
norms.close();
|
norms.close();
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,6 +68,8 @@ import org.apache.lucene.store.OutputStream;
|
||||||
import org.apache.lucene.search.Similarity;
|
import org.apache.lucene.search.Similarity;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
An IndexWriter creates and maintains an index.
|
An IndexWriter creates and maintains an index.
|
||||||
|
@ -89,12 +91,28 @@ public class IndexWriter {
|
||||||
private Directory directory; // where this index resides
|
private Directory directory; // where this index resides
|
||||||
private Analyzer analyzer; // how to analyze text
|
private Analyzer analyzer; // how to analyze text
|
||||||
|
|
||||||
|
private Similarity similarity = Similarity.getDefault(); // how to normalize
|
||||||
|
|
||||||
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
|
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
|
||||||
private final Directory ramDirectory = new RAMDirectory(); // for temp segs
|
private final Directory ramDirectory = new RAMDirectory(); // for temp segs
|
||||||
|
|
||||||
private Lock writeLock;
|
private Lock writeLock;
|
||||||
|
|
||||||
private Similarity similarity;
|
/** Expert: Set the Similarity implementation used by this IndexWriter.
|
||||||
|
*
|
||||||
|
* @see Similarity#setDefault(Similarity)
|
||||||
|
*/
|
||||||
|
public void setSimilarity(Similarity similarity) {
|
||||||
|
this.similarity = similarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Expert: Return the Similarity implementation used by this IndexWriter.
|
||||||
|
*
|
||||||
|
* <p>This defaults to the current value of {@link Similarity#getDefault()}.
|
||||||
|
*/
|
||||||
|
public Similarity getSimilarity() {
|
||||||
|
return this.similarity;
|
||||||
|
}
|
||||||
|
|
||||||
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
|
/** Constructs an IndexWriter for the index in <code>path</code>. Text will
|
||||||
be analyzed with <code>a</code>. If <code>create</code> is true, then a
|
be analyzed with <code>a</code>. If <code>create</code> is true, then a
|
||||||
|
@ -186,7 +204,7 @@ public class IndexWriter {
|
||||||
/** Adds a document to this index.*/
|
/** Adds a document to this index.*/
|
||||||
public void addDocument(Document doc) throws IOException {
|
public void addDocument(Document doc) throws IOException {
|
||||||
DocumentWriter dw =
|
DocumentWriter dw =
|
||||||
new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
|
new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
|
||||||
String segmentName = newSegmentName();
|
String segmentName = newSegmentName();
|
||||||
dw.addDocument(segmentName, doc);
|
dw.addDocument(segmentName, doc);
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
|
@ -407,13 +425,4 @@ public class IndexWriter {
|
||||||
}
|
}
|
||||||
directory.renameFile("deleteable.new", "deletable");
|
directory.renameFile("deleteable.new", "deletable");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the <code>Similarity</code> implementation to use.
|
|
||||||
*
|
|
||||||
* @param sim an instance of a class that implements <code>Similarity</code>
|
|
||||||
*/
|
|
||||||
public void setSimilarity(Similarity sim) {
|
|
||||||
similarity = sim;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -116,20 +116,20 @@ public class BooleanQuery extends Query {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer(IndexReader reader)
|
Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
if (clauses.size() == 1) { // optimize 1-term queries
|
if (clauses.size() == 1) { // optimize 1-term queries
|
||||||
BooleanClause c = (BooleanClause)clauses.elementAt(0);
|
BooleanClause c = (BooleanClause)clauses.elementAt(0);
|
||||||
if (!c.prohibited) // just return term scorer
|
if (!c.prohibited) // just return term scorer
|
||||||
return c.query.scorer(reader);
|
return c.query.scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
BooleanScorer result = new BooleanScorer();
|
BooleanScorer result = new BooleanScorer(similarity);
|
||||||
|
|
||||||
for (int i = 0 ; i < clauses.size(); i++) {
|
for (int i = 0 ; i < clauses.size(); i++) {
|
||||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||||
Scorer subScorer = c.query.scorer(reader);
|
Scorer subScorer = c.query.scorer(reader, similarity);
|
||||||
if (subScorer != null)
|
if (subScorer != null)
|
||||||
result.add(subScorer, c.required, c.prohibited);
|
result.add(subScorer, c.required, c.prohibited);
|
||||||
else if (c.required)
|
else if (c.required)
|
||||||
|
|
|
@ -70,6 +70,10 @@ final class BooleanScorer extends Scorer {
|
||||||
private int prohibitedMask = 0;
|
private int prohibitedMask = 0;
|
||||||
private int nextMask = 1;
|
private int nextMask = 1;
|
||||||
|
|
||||||
|
BooleanScorer(Similarity similarity) {
|
||||||
|
super(similarity);
|
||||||
|
}
|
||||||
|
|
||||||
static final class SubScorer {
|
static final class SubScorer {
|
||||||
public Scorer scorer;
|
public Scorer scorer;
|
||||||
public boolean required = false;
|
public boolean required = false;
|
||||||
|
@ -113,7 +117,7 @@ final class BooleanScorer extends Scorer {
|
||||||
private final void computeCoordFactors() throws IOException {
|
private final void computeCoordFactors() throws IOException {
|
||||||
coordFactors = new float[maxCoord];
|
coordFactors = new float[maxCoord];
|
||||||
for (int i = 0; i < maxCoord; i++)
|
for (int i = 0; i < maxCoord; i++)
|
||||||
coordFactors[i] = Similarity.coord(i, maxCoord);
|
coordFactors[i] = getSimilarity().coord(i, maxCoord);
|
||||||
}
|
}
|
||||||
|
|
||||||
final void score(HitCollector results, int maxDoc) throws IOException {
|
final void score(HitCollector results, int maxDoc) throws IOException {
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
|
||||||
|
/** Expert: Default scoring implementation. */
|
||||||
|
public class DefaultSimilarity extends Similarity {
|
||||||
|
/** Implemented as <code>1/sqrt(numTerms)</code>. */
|
||||||
|
public float lengthNorm(String fieldName, int numTerms) {
|
||||||
|
return (float)(1.0 / Math.sqrt(numTerms));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
|
||||||
|
public float queryNorm(float sumOfSquaredWeights) {
|
||||||
|
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>sqrt(freq)</code>. */
|
||||||
|
public float tf(float freq) {
|
||||||
|
return (float)Math.sqrt(freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>1 / (distance + 1)</code>. */
|
||||||
|
public float sloppyFreq(int distance) {
|
||||||
|
return 1.0f / (distance + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
|
||||||
|
public float idf(int docFreq, int numDocs) {
|
||||||
|
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implemented as <code>overlap / maxOverlap</code>. */
|
||||||
|
public float coord(int overlap, int maxOverlap) {
|
||||||
|
return overlap / (float)maxOverlap;
|
||||||
|
}
|
||||||
|
}
|
|
@ -61,9 +61,9 @@ import org.apache.lucene.index.*;
|
||||||
|
|
||||||
final class ExactPhraseScorer extends PhraseScorer {
|
final class ExactPhraseScorer extends PhraseScorer {
|
||||||
|
|
||||||
ExactPhraseScorer(TermPositions[] tps, byte[] n, float w)
|
ExactPhraseScorer(TermPositions[] tps, Similarity similarity,
|
||||||
throws IOException {
|
byte[] norms, float weight) throws IOException {
|
||||||
super(tps, n, w);
|
super(tps, similarity, norms, weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final float phraseFreq() throws IOException {
|
protected final float phraseFreq() throws IOException {
|
||||||
|
|
|
@ -85,7 +85,6 @@ public class MultiTermQuery extends Query {
|
||||||
/** Constructs a query for terms matching <code>term</code>. */
|
/** Constructs a query for terms matching <code>term</code>. */
|
||||||
public MultiTermQuery(Term term) {
|
public MultiTermQuery(Term term) {
|
||||||
this.term = term;
|
this.term = term;
|
||||||
this.query = query;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Set the TermEnum to be used */
|
/** Set the TermEnum to be used */
|
||||||
|
@ -105,8 +104,9 @@ public class MultiTermQuery extends Query {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Scorer scorer(IndexReader reader) throws IOException {
|
final Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
return getQuery().scorer(reader);
|
throws IOException {
|
||||||
|
return getQuery().scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
private final BooleanQuery getQuery() throws IOException {
|
private final BooleanQuery getQuery() throws IOException {
|
||||||
|
|
|
@ -147,7 +147,7 @@ public class PhrasePrefixQuery
|
||||||
_termArrays.add(terms);
|
_termArrays.add(terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer(IndexReader reader)
|
Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
if (_termArrays.size() == 0) // optimize zero-term case
|
if (_termArrays.size() == 0) // optimize zero-term case
|
||||||
|
@ -161,7 +161,7 @@ public class PhrasePrefixQuery
|
||||||
for (int i=0; i<terms.length; i++)
|
for (int i=0; i<terms.length; i++)
|
||||||
boq.add(new TermQuery(terms[i]), false, false);
|
boq.add(new TermQuery(terms[i]), false, false);
|
||||||
|
|
||||||
return boq.scorer(reader);
|
return boq.scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
TermPositions[] tps = new TermPositions[_termArrays.size()];
|
TermPositions[] tps = new TermPositions[_termArrays.size()];
|
||||||
|
@ -182,9 +182,11 @@ public class PhrasePrefixQuery
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_slop == 0)
|
if (_slop == 0)
|
||||||
return new ExactPhraseScorer(tps, reader.norms(_field), _weight);
|
return new ExactPhraseScorer(tps, similarity,
|
||||||
|
reader.norms(_field), _weight);
|
||||||
else
|
else
|
||||||
return new SloppyPhraseScorer(tps, _slop, reader.norms(_field), _weight);
|
return new SloppyPhraseScorer(tps, similarity, _slop,
|
||||||
|
reader.norms(_field), _weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
float sumOfSquaredWeights(Searcher searcher)
|
float sumOfSquaredWeights(Searcher searcher)
|
||||||
|
@ -195,7 +197,7 @@ public class PhrasePrefixQuery
|
||||||
{
|
{
|
||||||
Term[] terms = (Term[])i.next();
|
Term[] terms = (Term[])i.next();
|
||||||
for (int j=0; j<terms.length; j++)
|
for (int j=0; j<terms.length; j++)
|
||||||
_idf += Similarity.idf(terms[j], searcher);
|
_idf += searcher.getSimilarity().idf(terms[j], searcher);
|
||||||
}
|
}
|
||||||
|
|
||||||
_weight = _idf * boost;
|
_weight = _idf * boost;
|
||||||
|
|
|
@ -108,10 +108,7 @@ public class PhraseQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
||||||
idf = 0.0f;
|
idf = searcher.getSimilarity().idf(terms, searcher);
|
||||||
for (int i = 0; i < terms.size(); i++) // sum term IDFs
|
|
||||||
idf += Similarity.idf((Term)terms.elementAt(i), searcher);
|
|
||||||
|
|
||||||
weight = idf * boost;
|
weight = idf * boost;
|
||||||
return weight * weight; // square term weights
|
return weight * weight; // square term weights
|
||||||
}
|
}
|
||||||
|
@ -121,7 +118,8 @@ public class PhraseQuery extends Query {
|
||||||
weight *= idf; // factor from document
|
weight *= idf; // factor from document
|
||||||
}
|
}
|
||||||
|
|
||||||
final Scorer scorer(IndexReader reader) throws IOException {
|
final Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
|
throws IOException {
|
||||||
if (terms.size() == 0) // optimize zero-term case
|
if (terms.size() == 0) // optimize zero-term case
|
||||||
return null;
|
return null;
|
||||||
if (terms.size() == 1) { // optimize one-term case
|
if (terms.size() == 1) { // optimize one-term case
|
||||||
|
@ -129,7 +127,8 @@ public class PhraseQuery extends Query {
|
||||||
TermDocs docs = reader.termDocs(term);
|
TermDocs docs = reader.termDocs(term);
|
||||||
if (docs == null)
|
if (docs == null)
|
||||||
return null;
|
return null;
|
||||||
return new TermScorer(docs, reader.norms(term.field()), weight);
|
return new TermScorer(docs, similarity,
|
||||||
|
reader.norms(term.field()), weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
TermPositions[] tps = new TermPositions[terms.size()];
|
TermPositions[] tps = new TermPositions[terms.size()];
|
||||||
|
@ -141,10 +140,12 @@ public class PhraseQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slop == 0) // optimize exact case
|
if (slop == 0) // optimize exact case
|
||||||
return new ExactPhraseScorer(tps, reader.norms(field), weight);
|
return new ExactPhraseScorer(tps, similarity,
|
||||||
|
reader.norms(field), weight);
|
||||||
else
|
else
|
||||||
return
|
return
|
||||||
new SloppyPhraseScorer(tps, slop, reader.norms(field), weight);
|
new SloppyPhraseScorer(tps, similarity, slop,
|
||||||
|
reader.norms(field), weight);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -66,9 +66,11 @@ abstract class PhraseScorer extends Scorer {
|
||||||
protected PhraseQueue pq;
|
protected PhraseQueue pq;
|
||||||
protected PhrasePositions first, last;
|
protected PhrasePositions first, last;
|
||||||
|
|
||||||
PhraseScorer(TermPositions[] tps, byte[] n, float w) throws IOException {
|
PhraseScorer(TermPositions[] tps, Similarity similarity,
|
||||||
norms = n;
|
byte[] norms, float weight) throws IOException {
|
||||||
weight = w;
|
super(similarity);
|
||||||
|
this.norms = norms;
|
||||||
|
this.weight = weight;
|
||||||
|
|
||||||
// use PQ to build a sorted list of PhrasePositions
|
// use PQ to build a sorted list of PhrasePositions
|
||||||
pq = new PhraseQueue(tps.length);
|
pq = new PhraseQueue(tps.length);
|
||||||
|
@ -78,6 +80,7 @@ abstract class PhraseScorer extends Scorer {
|
||||||
}
|
}
|
||||||
|
|
||||||
final void score(HitCollector results, int end) throws IOException {
|
final void score(HitCollector results, int end) throws IOException {
|
||||||
|
Similarity similarity = getSimilarity();
|
||||||
while (last.doc < end) { // find doc w/ all the terms
|
while (last.doc < end) { // find doc w/ all the terms
|
||||||
while (first.doc < last.doc) { // scan forward in first
|
while (first.doc < last.doc) { // scan forward in first
|
||||||
do {
|
do {
|
||||||
|
@ -92,7 +95,7 @@ abstract class PhraseScorer extends Scorer {
|
||||||
float freq = phraseFreq(); // check for phrase
|
float freq = phraseFreq(); // check for phrase
|
||||||
|
|
||||||
if (freq > 0.0) {
|
if (freq > 0.0) {
|
||||||
float score = Similarity.tf(freq)*weight; // compute score
|
float score = similarity.tf(freq)*weight; // compute score
|
||||||
score *= Similarity.decodeNorm(norms[first.doc]); // normalize
|
score *= Similarity.decodeNorm(norms[first.doc]); // normalize
|
||||||
results.collect(first.doc, score); // add to results
|
results.collect(first.doc, score); // add to results
|
||||||
}
|
}
|
||||||
|
|
|
@ -90,8 +90,8 @@ public class PrefixQuery extends Query {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer(IndexReader reader) throws IOException {
|
Scorer scorer(IndexReader reader, Similarity similarity) throws IOException {
|
||||||
return getQuery().scorer(reader);
|
return getQuery().scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
private BooleanQuery getQuery() throws IOException {
|
private BooleanQuery getQuery() throws IOException {
|
||||||
|
|
|
@ -86,18 +86,19 @@ public abstract class Query implements java.io.Serializable
|
||||||
abstract void normalize(float norm);
|
abstract void normalize(float norm);
|
||||||
|
|
||||||
// query evaluation
|
// query evaluation
|
||||||
abstract Scorer scorer(IndexReader reader) throws IOException;
|
abstract Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
void prepare(IndexReader reader) {}
|
void prepare(IndexReader reader) {}
|
||||||
|
|
||||||
static Scorer scorer(Query query, Searcher searcher, IndexReader reader)
|
static Scorer scorer(Query query, Searcher searcher, IndexReader reader)
|
||||||
throws IOException
|
throws IOException {
|
||||||
{
|
Similarity similarity = searcher.getSimilarity();
|
||||||
query.prepare(reader);
|
query.prepare(reader);
|
||||||
float sum = query.sumOfSquaredWeights(searcher);
|
float sum = query.sumOfSquaredWeights(searcher);
|
||||||
float norm = 1.0f / (float)Math.sqrt(sum);
|
float norm = similarity.queryNorm(sum);
|
||||||
query.normalize(norm);
|
query.normalize(norm);
|
||||||
return query.scorer(reader);
|
return query.scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -113,9 +113,9 @@ public class RangeQuery extends Query
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer(IndexReader reader) throws IOException
|
Scorer scorer(IndexReader reader, Similarity similarity) throws IOException
|
||||||
{
|
{
|
||||||
return getQuery().scorer(reader);
|
return getQuery().scorer(reader, similarity);
|
||||||
}
|
}
|
||||||
|
|
||||||
private BooleanQuery getQuery() throws IOException
|
private BooleanQuery getQuery() throws IOException
|
||||||
|
|
|
@ -57,5 +57,15 @@ package org.apache.lucene.search;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
abstract class Scorer {
|
abstract class Scorer {
|
||||||
|
private Similarity similarity;
|
||||||
|
|
||||||
|
protected Scorer(Similarity similarity) {
|
||||||
|
this.similarity = similarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Similarity getSimilarity() {
|
||||||
|
return this.similarity;
|
||||||
|
}
|
||||||
|
|
||||||
abstract void score(HitCollector hc, int maxDoc) throws IOException;
|
abstract void score(HitCollector hc, int maxDoc) throws IOException;
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,9 +63,6 @@ import org.apache.lucene.index.IndexReader;
|
||||||
* Implements some common utility methods.
|
* Implements some common utility methods.
|
||||||
*/
|
*/
|
||||||
public abstract class Searcher implements Searchable {
|
public abstract class Searcher implements Searchable {
|
||||||
|
|
||||||
protected Similarity similarity;
|
|
||||||
|
|
||||||
/** Returns the documents matching <code>query</code>. */
|
/** Returns the documents matching <code>query</code>. */
|
||||||
public final Hits search(Query query) throws IOException {
|
public final Hits search(Query query) throws IOException {
|
||||||
return search(query, (Filter)null);
|
return search(query, (Filter)null);
|
||||||
|
@ -91,12 +88,22 @@ public abstract class Searcher implements Searchable {
|
||||||
search(query, (Filter)null, results);
|
search(query, (Filter)null, results);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** The Similarity implementation used by this searcher. */
|
||||||
* Sets the <code>Similarity</code> implementation to use.
|
private Similarity similarity = Similarity.getDefault();
|
||||||
|
|
||||||
|
/** Expert: Set the Similarity implementation used by this Searcher.
|
||||||
*
|
*
|
||||||
* @param sim an instance of a class that implements <code>Similarity</code>
|
* @see Similarity#setDefault(Similarity)
|
||||||
*/
|
*/
|
||||||
public void setSimilarity(Similarity sim) {
|
public void setSimilarity(Similarity similarity) {
|
||||||
similarity = sim;
|
this.similarity = similarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Expert: Return the Similarity implementation used by this Searcher.
|
||||||
|
*
|
||||||
|
* <p>This defaults to the current value of {@link Similarity#getDefault()}.
|
||||||
|
*/
|
||||||
|
public Similarity getSimilarity() {
|
||||||
|
return this.similarity;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,14 +55,73 @@ package org.apache.lucene.search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Vector;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
|
||||||
/** Internal class used for scoring.
|
/** Expert: Scoring API.
|
||||||
* <p>Public only so that the indexing code can compute and store the
|
* <p>Subclasses implement search scoring.
|
||||||
* normalization byte for each document. */
|
*
|
||||||
|
* <p>The score of query <code>q</code> for document <code>d</code> is defined
|
||||||
|
* in terms of these methods as follows:
|
||||||
|
*
|
||||||
|
* <table cellpadding="0" cellspacing="0" border="0">
|
||||||
|
* <tr>
|
||||||
|
* <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
|
||||||
|
* <td valign="middle" align="center">
|
||||||
|
* <big><big><big><big><big>Σ</big></big></big></big></big></td>
|
||||||
|
* <td valign="middle"><small>
|
||||||
|
* {@link #tf(int) tf}(t in d) *
|
||||||
|
* {@link #idf(Term,Searcher) idf}(t) *
|
||||||
|
* {@link Field#getBoost getBoost}(t.field in d) *
|
||||||
|
* {@link #lengthNorm(String,int) lengthNorm}(t.field in d)
|
||||||
|
* </small></td>
|
||||||
|
* <td valign="middle" rowspan="2"> *
|
||||||
|
* {@link #coord(int,int) coord}(q,d) *
|
||||||
|
* {@link #queryNorm(float) queryNorm}(q)
|
||||||
|
* </td>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td valign="top" align="right">
|
||||||
|
* <small>t in q</small>
|
||||||
|
* </td>
|
||||||
|
* </tr>
|
||||||
|
* </table>
|
||||||
|
*
|
||||||
|
* @see #setDefault(Similarity)
|
||||||
|
* @see IndexWriter#setSimilarity(Similarity)
|
||||||
|
* @see Searcher#setSimilarity(Similarity)
|
||||||
|
*/
|
||||||
public abstract class Similarity {
|
public abstract class Similarity {
|
||||||
|
/** The Similarity implementation used by default. */
|
||||||
|
private static Similarity defaultImpl = new DefaultSimilarity();
|
||||||
|
|
||||||
|
/** Set the default Similarity implementation used by indexing and search
|
||||||
|
* code.
|
||||||
|
*
|
||||||
|
* @see Searcher#setSimilarity(Similarity)
|
||||||
|
* @see IndexWriter#setSimilarity(Similarity)
|
||||||
|
*/
|
||||||
|
public static void setDefault(Similarity similarity) {
|
||||||
|
Similarity.defaultImpl = similarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the default Similarity implementation used by indexing and search
|
||||||
|
* code.
|
||||||
|
*
|
||||||
|
* <p>This is initially an instance of {@link DefaultSimilarity}.
|
||||||
|
*
|
||||||
|
* @see Searcher#setSimilarity(Similarity)
|
||||||
|
* @see IndexWriter#setSimilarity(Similarity)
|
||||||
|
*/
|
||||||
|
public static Similarity getDefault() {
|
||||||
|
return Similarity.defaultImpl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Cache of decoded bytes. */
|
||||||
private static final float[] NORM_TABLE = new float[256];
|
private static final float[] NORM_TABLE = new float[256];
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
@ -70,31 +129,6 @@ public abstract class Similarity {
|
||||||
NORM_TABLE[i] = byteToFloat((byte)i);
|
NORM_TABLE[i] = byteToFloat((byte)i);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Similarity similarity;
|
|
||||||
|
|
||||||
private Similarity() {} // no public constructor
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the <code>Similarity</code> implementation to use.
|
|
||||||
*
|
|
||||||
* @param sim an instance of a class that implements <code>Similarity</code
|
|
||||||
*/
|
|
||||||
public static void setDefaultSimilarity(Similarity sim) {
|
|
||||||
similarity = sim;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Computes the normalization value for a document given the total number of
|
|
||||||
* terms contained in a field. These values are stored in an index and used
|
|
||||||
* by the search code.
|
|
||||||
*
|
|
||||||
* <p>The formula used is: <code>1.0f / Math.sqrt(numTerms)</code>
|
|
||||||
*
|
|
||||||
* @see Field#setBoost(float)
|
|
||||||
*/
|
|
||||||
public static float normalizeLength(int numTerms) {
|
|
||||||
return (float)(1.0 / Math.sqrt(numTerms));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Decodes a normalization factor stored in an index.
|
/** Decodes a normalization factor stored in an index.
|
||||||
* @see #encodeNorm(float)
|
* @see #encodeNorm(float)
|
||||||
*/
|
*/
|
||||||
|
@ -102,6 +136,41 @@ public abstract class Similarity {
|
||||||
return NORM_TABLE[b & 0xFF];
|
return NORM_TABLE[b & 0xFF];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Computes the normalization value for a field given the total number of
|
||||||
|
* terms contained in a field. These values, together with field boosts, are
|
||||||
|
* stored in an index and multiplied into scores for hits on each field by the
|
||||||
|
* search code.
|
||||||
|
*
|
||||||
|
* <p>Matches in longer fields are less precise, so implementations of this
|
||||||
|
* method usually return smaller values when <code>numTokens</code> is large,
|
||||||
|
* and larger values when <code>numTokens</code> is small.
|
||||||
|
*
|
||||||
|
* <p>Note that these values are computed under {@link
|
||||||
|
* IndexWriter#addDocument(Document)} and stored then using
|
||||||
|
* {@link #encodeNorm(float)}. Thus they have limited precision, and documents
|
||||||
|
* must be re-indexed if this method is altered.
|
||||||
|
*
|
||||||
|
* @param fieldName the name of the field
|
||||||
|
* @param numTokens the total number of tokens contained in fields named
|
||||||
|
* <i>fieldName</i> of <i>doc</i>.
|
||||||
|
* @return a normalization factor for hits on this field of this document
|
||||||
|
*
|
||||||
|
* @see Field#setBoost(float)
|
||||||
|
*/
|
||||||
|
public abstract float lengthNorm(String fieldName, int numTokens);
|
||||||
|
|
||||||
|
/** Computes the normalization value for a query given the sum of the squared
|
||||||
|
* weights of each of the query terms. This value is then multiplied into the
|
||||||
|
* weight of each query term.
|
||||||
|
*
|
||||||
|
* <p>This does not affect ranking, but rather just attempts to make scores
|
||||||
|
* from different queries comparable.
|
||||||
|
*
|
||||||
|
* @param sumOfSquaredWeights the sum of the squares of query term weights
|
||||||
|
* @return a normalization factor for query weights
|
||||||
|
*/
|
||||||
|
public abstract float queryNorm(float sumOfSquaredWeights);
|
||||||
|
|
||||||
/** Encodes a normalization factor for storage in an index.
|
/** Encodes a normalization factor for storage in an index.
|
||||||
*
|
*
|
||||||
* <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
|
* <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
|
||||||
|
@ -151,25 +220,118 @@ public abstract class Similarity {
|
||||||
return (byte)((exponent << 3) | mantissa); // pack into a byte
|
return (byte)((exponent << 3) | mantissa); // pack into a byte
|
||||||
}
|
}
|
||||||
|
|
||||||
static final float tf(int freq) {
|
|
||||||
return (float)Math.sqrt(freq);
|
/** Computes a score factor based on a term or phrase's frequency in a
|
||||||
|
* document. This value is multiplied by the {@link #idf(Term, Searcher)}
|
||||||
|
* factor for each term in the query and these products are then summed to
|
||||||
|
* form the initial score for a document.
|
||||||
|
*
|
||||||
|
* <p>Terms and phrases repeated in a document indicate the topic of the
|
||||||
|
* document, so implementations of this method usually return larger values
|
||||||
|
* when <code>freq</code> is large, and smaller values when <code>freq</code>
|
||||||
|
* is small.
|
||||||
|
*
|
||||||
|
* <p>The default implementation calls {@link #tf(float)}.
|
||||||
|
*
|
||||||
|
* @param freq the frequency of a term within a document
|
||||||
|
* @return a score factor based on a term's within-document frequency
|
||||||
|
*/
|
||||||
|
public float tf(int freq) {
|
||||||
|
return tf((float)freq);
|
||||||
}
|
}
|
||||||
|
|
||||||
static final float tf(float freq) {
|
/** Computes the amount of a sloppy phrase match, based on an edit distance.
|
||||||
return (float)Math.sqrt(freq);
|
* This value is summed for each sloppy phrase match in a document to form
|
||||||
}
|
* the frequency that is passed to {@link #tf(float)}.
|
||||||
|
*
|
||||||
|
* <p>A phrase match with a small edit distance to a document passage more
|
||||||
|
* closely matches the document, so implementations of this method usually
|
||||||
|
* return larger values when the edit distance is small and smaller values
|
||||||
|
* when it is large.
|
||||||
|
*
|
||||||
|
* @see PhraseQuery#setSlop(int)
|
||||||
|
* @param distance the edit distance of this sloppy phrase match
|
||||||
|
* @return the frequency increment for this match
|
||||||
|
*/
|
||||||
|
public abstract float sloppyFreq(int distance);
|
||||||
|
|
||||||
static final float idf(Term term, Searcher searcher) throws IOException {
|
/** Computes a score factor based on a term or phrase's frequency in a
|
||||||
// Use maxDoc() instead of numDocs() because its proportional to docFreq(),
|
* document. This value is multiplied by the {@link #idf(Term, Searcher)}
|
||||||
// i.e., when one is inaccurate, so is the other, and in the same way.
|
* factor for each term in the query and these products are then summed to
|
||||||
|
* form the initial score for a document.
|
||||||
|
*
|
||||||
|
* <p>Terms and phrases repeated in a document indicate the topic of the
|
||||||
|
* document, so implementations of this method usually return larger values
|
||||||
|
* when <code>freq</code> is large, and smaller values when <code>freq</code>
|
||||||
|
* is small.
|
||||||
|
*
|
||||||
|
* @param freq the frequency of a term within a document
|
||||||
|
* @return a score factor based on a term's within-document frequency
|
||||||
|
*/
|
||||||
|
public abstract float tf(float freq);
|
||||||
|
|
||||||
|
/** Computes a score factor for a simple term.
|
||||||
|
*
|
||||||
|
* <p>The default implementation is:<pre>
|
||||||
|
* return idf(searcher.docFreq(term), searcher.maxDoc());
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* Note that {@link Searcher#maxDoc()} is used instead of {@link
|
||||||
|
* IndexReader#numDocs()} because it is proportional to {@link
|
||||||
|
* Searcher#docFreq(Term)} , i.e., when one is inaccurate, so is the other,
|
||||||
|
* and in the same direction.
|
||||||
|
*
|
||||||
|
* @param term the term in question
|
||||||
|
* @param searcher the document collection being searched
|
||||||
|
* @return a score factor for the term
|
||||||
|
*/
|
||||||
|
public float idf(Term term, Searcher searcher) throws IOException {
|
||||||
return idf(searcher.docFreq(term), searcher.maxDoc());
|
return idf(searcher.docFreq(term), searcher.maxDoc());
|
||||||
}
|
}
|
||||||
|
|
||||||
static final float idf(int docFreq, int numDocs) {
|
/** Computes a score factor for a phrase.
|
||||||
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
*
|
||||||
|
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
|
||||||
|
* for each term in the phrase.
|
||||||
|
*
|
||||||
|
* @param terms the vector of terms in the phrase
|
||||||
|
* @param searcher the document collection being searched
|
||||||
|
* @return a score factor for the phrase
|
||||||
|
*/
|
||||||
|
public float idf(Vector terms, Searcher searcher) throws IOException {
|
||||||
|
float idf = 0.0f;
|
||||||
|
for (int i = 0; i < terms.size(); i++) {
|
||||||
|
idf += idf((Term)terms.elementAt(i), searcher);
|
||||||
|
}
|
||||||
|
return idf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static final float coord(int overlap, int maxOverlap) {
|
/** Computes a score factor based on a term's document frequency (the number
|
||||||
return overlap / (float)maxOverlap;
|
* of documents which contain the term). This value is multiplied by the
|
||||||
}
|
* {@link #tf(int)} factor for each term in the query and these products are
|
||||||
|
* then summed to form the initial score for a document.
|
||||||
|
*
|
||||||
|
* <p>Terms that occur in fewer documents are better indicators of topic, so
|
||||||
|
* implementations of this method usually return larger values for rare terms,
|
||||||
|
* and smaller values for common terms.
|
||||||
|
*
|
||||||
|
* @param docFreq the number of documents which contain the term
|
||||||
|
* @param numDocs the total number of documents in the collection
|
||||||
|
* @return a score factor based on the term's document frequency
|
||||||
|
*/
|
||||||
|
protected abstract float idf(int docFreq, int numDocs);
|
||||||
|
|
||||||
|
/** Computes a score factor based on the fraction of all query terms that a
|
||||||
|
* document contains. This value is multiplied into scores.
|
||||||
|
*
|
||||||
|
* <p>The presence of a large portion of the query terms indicates a better
|
||||||
|
* match with the query, so implementations of this method usually return
|
||||||
|
* larger values when the ratio between these parameters is large and smaller
|
||||||
|
* values when the ratio between them is small.
|
||||||
|
*
|
||||||
|
* @param overlap the number of query terms matched in the document
|
||||||
|
* @param maxOverlap the total number of terms in the query
|
||||||
|
* @return a score factor based on term overlap with the query
|
||||||
|
*/
|
||||||
|
public abstract float coord(int overlap, int maxOverlap);
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,10 +62,10 @@ import org.apache.lucene.index.*;
|
||||||
final class SloppyPhraseScorer extends PhraseScorer {
|
final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
private int slop;
|
private int slop;
|
||||||
|
|
||||||
SloppyPhraseScorer(TermPositions[] tps, int s, byte[] n, float w)
|
SloppyPhraseScorer(TermPositions[] tps, Similarity similarity,
|
||||||
throws IOException {
|
int slop, byte[] norms, float weight) throws IOException {
|
||||||
super(tps, n, w);
|
super(tps, similarity, norms, weight);
|
||||||
slop = s;
|
this.slop = slop;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final float phraseFreq() throws IOException {
|
protected final float phraseFreq() throws IOException {
|
||||||
|
@ -94,7 +94,7 @@ final class SloppyPhraseScorer extends PhraseScorer {
|
||||||
|
|
||||||
int matchLength = end - start;
|
int matchLength = end - start;
|
||||||
if (matchLength <= slop)
|
if (matchLength <= slop)
|
||||||
freq += 1.0 / (matchLength + 1); // penalize longer matches
|
freq += getSimilarity().sloppyFreq(matchLength); // score match
|
||||||
|
|
||||||
if (pp.position > end)
|
if (pp.position > end)
|
||||||
end = pp.position;
|
end = pp.position;
|
||||||
|
|
|
@ -73,7 +73,7 @@ public class TermQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
final float sumOfSquaredWeights(Searcher searcher) throws IOException {
|
||||||
idf = Similarity.idf(term, searcher);
|
idf = searcher.getSimilarity().idf(term, searcher);
|
||||||
weight = idf * boost;
|
weight = idf * boost;
|
||||||
return weight * weight; // square term weights
|
return weight * weight; // square term weights
|
||||||
}
|
}
|
||||||
|
@ -83,14 +83,15 @@ public class TermQuery extends Query {
|
||||||
weight *= idf; // factor from document
|
weight *= idf; // factor from document
|
||||||
}
|
}
|
||||||
|
|
||||||
Scorer scorer(IndexReader reader)
|
Scorer scorer(IndexReader reader, Similarity similarity)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
TermDocs termDocs = reader.termDocs(term);
|
TermDocs termDocs = reader.termDocs(term);
|
||||||
|
|
||||||
if (termDocs == null)
|
if (termDocs == null)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
return new TermScorer(termDocs, reader.norms(term.field()), weight);
|
return new TermScorer(termDocs, similarity,
|
||||||
|
reader.norms(term.field()), weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Prints a user-readable version of this query. */
|
/** Prints a user-readable version of this query. */
|
||||||
|
|
|
@ -63,21 +63,23 @@ final class TermScorer extends Scorer {
|
||||||
private float weight;
|
private float weight;
|
||||||
private int doc;
|
private int doc;
|
||||||
|
|
||||||
private final int[] docs = new int[128]; // buffered doc numbers
|
private final int[] docs = new int[32]; // buffered doc numbers
|
||||||
private final int[] freqs = new int[128]; // buffered term freqs
|
private final int[] freqs = new int[32]; // buffered term freqs
|
||||||
private int pointer;
|
private int pointer;
|
||||||
private int pointerMax;
|
private int pointerMax;
|
||||||
|
|
||||||
private static final int SCORE_CACHE_SIZE = 32;
|
private static final int SCORE_CACHE_SIZE = 32;
|
||||||
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
|
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
|
||||||
|
|
||||||
TermScorer(TermDocs td, byte[] n, float w) throws IOException {
|
TermScorer(TermDocs td, Similarity similarity, byte[] norms, float weight)
|
||||||
termDocs = td;
|
throws IOException {
|
||||||
norms = n;
|
super(similarity);
|
||||||
weight = w;
|
this.termDocs = td;
|
||||||
|
this.norms = norms;
|
||||||
|
this.weight = weight;
|
||||||
|
|
||||||
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
|
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
|
||||||
scoreCache[i] = Similarity.tf(i) * weight;
|
scoreCache[i] = getSimilarity().tf(i) * weight;
|
||||||
|
|
||||||
pointerMax = termDocs.read(docs, freqs); // fill buffers
|
pointerMax = termDocs.read(docs, freqs); // fill buffers
|
||||||
|
|
||||||
|
@ -91,12 +93,13 @@ final class TermScorer extends Scorer {
|
||||||
|
|
||||||
final void score(HitCollector c, final int end) throws IOException {
|
final void score(HitCollector c, final int end) throws IOException {
|
||||||
int d = doc; // cache doc in local
|
int d = doc; // cache doc in local
|
||||||
|
Similarity similarity = getSimilarity(); // cache sim in local
|
||||||
while (d < end) { // for docs in window
|
while (d < end) { // for docs in window
|
||||||
final int f = freqs[pointer];
|
final int f = freqs[pointer];
|
||||||
float score = // compute tf(f)*weight
|
float score = // compute tf(f)*weight
|
||||||
f < SCORE_CACHE_SIZE // check cache
|
f < SCORE_CACHE_SIZE // check cache
|
||||||
? scoreCache[f] // cache hit
|
? scoreCache[f] // cache hit
|
||||||
: Similarity.tf(f)*weight; // cache miss
|
: similarity.tf(f)*weight; // cache miss
|
||||||
|
|
||||||
score *= Similarity.decodeNorm(norms[d]); // normalize for field
|
score *= Similarity.decodeNorm(norms[d]); // normalize for field
|
||||||
|
|
||||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
import org.apache.lucene.demo.FileDocument;
|
import org.apache.lucene.demo.FileDocument;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -95,7 +96,8 @@ class DocTest {
|
||||||
throws Exception {
|
throws Exception {
|
||||||
Directory directory = FSDirectory.getDirectory("test", false);
|
Directory directory = FSDirectory.getDirectory("test", false);
|
||||||
Analyzer analyzer = new SimpleAnalyzer();
|
Analyzer analyzer = new SimpleAnalyzer();
|
||||||
DocumentWriter writer = new DocumentWriter(directory, analyzer, 1000);
|
DocumentWriter writer =
|
||||||
|
new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000);
|
||||||
|
|
||||||
File file = new File(fileName);
|
File file = new File(fileName);
|
||||||
Document doc = FileDocument.Document(file);
|
Document doc = FileDocument.Document(file);
|
||||||
|
|
|
@ -76,7 +76,7 @@ public class TestDocBoost extends TestCase {
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void test() throws Exception {
|
public void testDocBoost() throws Exception {
|
||||||
RAMDirectory store = new RAMDirectory();
|
RAMDirectory store = new RAMDirectory();
|
||||||
IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
|
IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,161 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
|
/** Similarity unit test.
|
||||||
|
*
|
||||||
|
* @author Doug Cutting
|
||||||
|
* @version $Revision$
|
||||||
|
*/
|
||||||
|
public class TestSimilarity extends TestCase {
|
||||||
|
public TestSimilarity(String name) {
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class SimpleSimilarity extends Similarity {
|
||||||
|
public float lengthNorm(String field, int numTerms) { return 1.0f; }
|
||||||
|
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
|
||||||
|
public float tf(float freq) { return freq; }
|
||||||
|
public float sloppyFreq(int distance) { return 2.0f; }
|
||||||
|
public float idf(Vector terms, Searcher searcher) { return 1.0f; }
|
||||||
|
public float idf(int docFreq, int numDocs) { return 1.0f; }
|
||||||
|
public float coord(int overlap, int maxOverlap) { return 1.0f; }
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimilarity() throws Exception {
|
||||||
|
RAMDirectory store = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
|
||||||
|
writer.setSimilarity(new SimpleSimilarity());
|
||||||
|
|
||||||
|
Document d1 = new Document();
|
||||||
|
d1.add(Field.Text("field", "a c"));
|
||||||
|
|
||||||
|
Document d2 = new Document();
|
||||||
|
d2.add(Field.Text("field", "a b c"));
|
||||||
|
|
||||||
|
writer.addDocument(d1);
|
||||||
|
writer.addDocument(d2);
|
||||||
|
writer.optimize();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
final float[] scores = new float[4];
|
||||||
|
|
||||||
|
Searcher searcher = new IndexSearcher(store);
|
||||||
|
searcher.setSimilarity(new SimpleSimilarity());
|
||||||
|
|
||||||
|
Term a = new Term("field", "a");
|
||||||
|
Term b = new Term("field", "b");
|
||||||
|
Term c = new Term("field", "c");
|
||||||
|
|
||||||
|
searcher.search
|
||||||
|
(new TermQuery(b),
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
assertTrue(score == 1.0f);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
BooleanQuery bq = new BooleanQuery();
|
||||||
|
bq.add(new TermQuery(a), false, false);
|
||||||
|
bq.add(new TermQuery(b), false, false);
|
||||||
|
//System.out.println(bq.toString("field"));
|
||||||
|
searcher.search
|
||||||
|
(bq,
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
//System.out.println("Doc=" + doc + " score=" + score);
|
||||||
|
assertTrue(score == (float)doc+1);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
PhraseQuery pq = new PhraseQuery();
|
||||||
|
pq.add(a);
|
||||||
|
pq.add(c);
|
||||||
|
//System.out.println(pq.toString("field"));
|
||||||
|
searcher.search
|
||||||
|
(pq,
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
//System.out.println("Doc=" + doc + " score=" + score);
|
||||||
|
assertTrue(score == 1.0f);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
pq.setSlop(2);
|
||||||
|
//System.out.println(pq.toString("field"));
|
||||||
|
searcher.search
|
||||||
|
(pq,
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
//System.out.println("Doc=" + doc + " score=" + score);
|
||||||
|
assertTrue(score == 2.0f);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue