LUCENE-2392: decouple vector space scoring from Query/Weight/Scorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144158 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-07-08 05:08:05 +00:00
parent 4163a4a955
commit ea67cd8b2c
80 changed files with 1814 additions and 1656 deletions

View File

@@ -156,6 +156,12 @@ Changes in backwards compatibility policy
   the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt
   for more information (Chris Male)
 
+* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you
+  extended Similarity directly before, you should extend TFIDFSimilarity instead.
+  Similarity is now a lower-level API to implement other scoring algorithms.
+  See MIGRATE.txt for more details.
+  (David Nemeskey, Simon Willnauer, Mike McCandless, Robert Muir)
+
 Changes in Runtime Behavior
 
 * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
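A minimal sketch of the new extension point described in the entry above, assuming this commit's trunk API (DefaultSimilarity extends TFIDFSimilarity, computeNorm returns an already-encoded byte, encodeNormValue lives on DefaultSimilarity). The class name MySimilarity and the particular overrides are illustrative only, not part of the commit:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.DefaultSimilarity;

public class MySimilarity extends DefaultSimilarity {

  // computeNorm now hands back the already-quantized byte, so the encoding
  // step happens here rather than in the indexing chain.
  @Override
  public byte computeNorm(FieldInvertState state) {
    final int numTerms = state.getLength() - state.getNumOverlap();
    return encodeNormValue(state.getBoost() * (float) (1.0 / Math.sqrt(numTerms)));
  }

  // the tf/idf hooks stay on the TF/IDF subclass hierarchy
  @Override
  public float tf(float freq) {
    return (float) Math.sqrt(freq);
  }
}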

View File

@@ -382,3 +382,13 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
   - o.a.l.search.function.ShortFieldSource -> o.a.l.queries.function.valuesource.ShortFieldSource
   - o.a.l.search.function.ValueSource -> o.a.l.queries.function.ValueSource
   - o.a.l.search.function.ValueSourceQuery -> o.a.l.queries.function.FunctionQuery
+
+* LUCENE-2392: Enable flexible scoring:
+
+  The existing "Similarity" API is now TFIDFSimilarity; if you were extending
+  Similarity before, you should likely extend TFIDFSimilarity instead.
+
+  Weight.normalize no longer takes a norm value that incorporates the top-level
+  boost from outer queries such as BooleanQuery; instead it takes two parameters,
+  the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has
+  been renamed to Weight.getValueForNormalization().
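A rough sketch of the two renamed Weight methods described above, written as a boost-aware delegating weight in the style of the BooleanQuery change later in this commit. The DelegatingWeight class itself is illustrative and not part of the commit:

import java.io.IOException;
import org.apache.lucene.search.Weight;

public abstract class DelegatingWeight extends Weight {
  protected final Weight inner;
  protected final float boost;

  protected DelegatingWeight(Weight inner, float boost) {
    this.inner = inner;
    this.boost = boost;
  }

  // formerly sumOfSquaredWeights(): contribute this node's squared weight
  @Override
  public float getValueForNormalization() throws IOException {
    return inner.getValueForNormalization() * boost * boost;
  }

  // formerly normalize(norm): outer boosts now arrive as a separate argument,
  // so fold this node's boost in before passing both values down.
  @Override
  public void normalize(float norm, float topLevelBoost) {
    inner.normalize(norm, topLevelBoost * boost);
  }
}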

View File

@@ -240,8 +240,7 @@ public class InstantiatedIndexWriter implements Closeable {
 final FieldInvertState invertState = new FieldInvertState();
 invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
 invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
-final float norm = similarityProvider.get(fieldName).computeNorm(invertState);
-normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).encodeNormValue(norm);
+normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).computeNorm(invertState);
 } else {
 System.currentTimeMillis();
 }
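The hunk above is the index-time side of the change: Similarity.computeNorm now returns the quantized byte itself, so callers no longer encode separately. A small sketch, assuming a per-field SimilarityProvider named similarityProvider and an already-populated FieldInvertState:

FieldInvertState state = new FieldInvertState();
state.setBoost(1.0f);
state.setLength(42);

Similarity fieldSim = similarityProvider.get("body");
// before this commit: byte norm = fieldSim.encodeNormValue(fieldSim.computeNorm(state));
byte norm = fieldSim.computeNorm(state);  // the byte comes back pre-encoded now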

View File

@@ -51,7 +51,6 @@ import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.TermVectorMapper;
 import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader.ReaderContext;
 import org.apache.lucene.index.codecs.PerDocValues;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
@@ -1202,15 +1201,14 @@ public class MemoryIndex {
 int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
 float boost = info != null ? info.getBoost() : 1.0f;
 FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
-float n = fieldSim.computeNorm(invertState);
-byte norm = fieldSim.encodeNormValue(n);
+byte norm = fieldSim.computeNorm(invertState);
 norms = new byte[] {norm};
 // cache it for future reuse
 cachedNorms = norms;
 cachedFieldName = fieldName;
 cachedSimilarity = sim;
-if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens);
+if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + norm + ":" + numTokens);
 }
 return norms;
 }

View File

@@ -147,7 +147,7 @@ public class FieldNormModifier {
 for (int d = 0; d < termCounts.length; d++) {
 if (liveDocs == null || liveDocs.get(d)) {
 invertState.setLength(termCounts[d]);
-subReader.setNorm(d, field, fieldSim.encodeNormValue(fieldSim.computeNorm(invertState)));
+subReader.setNorm(d, field, fieldSim.computeNorm(invertState));
 }
 }
 }

View File

@@ -106,7 +106,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
 * discountOverlaps is true by default or true for this
 * specific field. */
 @Override
-public float computeNorm(FieldInvertState state) {
+public byte computeNorm(FieldInvertState state) {
 final int numTokens;
 if (discountOverlaps)
@@ -114,7 +114,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
 else
 numTokens = state.getLength();
-return state.getBoost() * computeLengthNorm(numTokens);
+return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens));
 }
 /**

View File

@@ -49,8 +49,8 @@ public class TestFieldNormModifier extends LuceneTestCase {
 public Similarity get(String field) {
 return new DefaultSimilarity() {
 @Override
-public float computeNorm(FieldInvertState state) {
-return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
+public byte computeNorm(FieldInvertState state) {
+return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
 }
 };
 }

View File

@ -21,6 +21,7 @@ package org.apache.lucene.misc;
import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
@ -58,15 +59,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("3,10: spot i="+i, assertEquals("3,10: spot i="+i,
1.0f, 1.0f,
s.computeNorm(invertState), ss.decodeNormValue(s.computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 10; i < 1000; i++) { for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9); invertState.setLength(i-9);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = s.computeNorm(invertState); final byte normS = s.computeNorm(invertState);
assertEquals("3,10: 10<x : i="+i, assertEquals("3,10: 10<x : i="+i,
normD, normD,
normS, normS,
@ -104,14 +105,14 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 3,10: spot i="+i, assertEquals("f: 3,10: spot i="+i,
1.0f, 1.0f,
sp.get("foo").computeNorm(invertState), ss.decodeNormValue(sp.get("foo").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 10; i < 1000; i++) { for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9); invertState.setLength(i-9);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("foo").computeNorm(invertState); final byte normS = sp.get("foo").computeNorm(invertState);
assertEquals("f: 3,10: 10<x : i="+i, assertEquals("f: 3,10: 10<x : i="+i,
normD, normD,
normS, normS,
@ -121,21 +122,21 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 8,13: spot i="+i, assertEquals("f: 8,13: spot i="+i,
1.0f, 1.0f,
sp.get("bar").computeNorm(invertState), ss.decodeNormValue(sp.get("bar").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 6; i <=9; i++) { for (int i = 6; i <=9; i++) {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 6,9: spot i="+i, assertEquals("f: 6,9: spot i="+i,
1.0f, 1.0f,
sp.get("yak").computeNorm(invertState), ss.decodeNormValue(sp.get("yak").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 13; i < 1000; i++) { for (int i = 13; i < 1000; i++) {
invertState.setLength(i-12); invertState.setLength(i-12);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("bar").computeNorm(invertState); final byte normS = sp.get("bar").computeNorm(invertState);
assertEquals("f: 8,13: 13<x : i="+i, assertEquals("f: 8,13: 13<x : i="+i,
normD, normD,
normS, normS,
@ -143,9 +144,9 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
} }
for (int i = 9; i < 1000; i++) { for (int i = 9; i < 1000; i++) {
invertState.setLength(i-8); invertState.setLength(i-8);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("yak").computeNorm(invertState); final byte normS = sp.get("yak").computeNorm(invertState);
assertEquals("f: 6,9: 9<x : i="+i, assertEquals("f: 6,9: 9<x : i="+i,
normD, normD,
normS, normS,
@ -157,8 +158,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
for (int i = 9; i < 1000; i++) { for (int i = 9; i < 1000; i++) {
invertState.setLength(i); invertState.setLength(i);
final float normSS = sp.get("a").computeNorm(invertState); final byte normSS = sp.get("a").computeNorm(invertState);
final float normS = sp.get("b").computeNorm(invertState); final byte normS = sp.get("b").computeNorm(invertState);
assertTrue("s: i="+i+" : a="+normSS+ assertTrue("s: i="+i+" : a="+normSS+
" < b="+normS, " < b="+normS,
normSS < normS); normSS < normS);
@ -170,8 +171,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
SweetSpotSimilarity ss = new SweetSpotSimilarity(); SweetSpotSimilarity ss = new SweetSpotSimilarity();
Similarity d = new DefaultSimilarity(); TFIDFSimilarity d = new DefaultSimilarity();
Similarity s = ss; TFIDFSimilarity s = ss;
// tf equal // tf equal
@ -222,7 +223,7 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
}; };
ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f); ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);
Similarity s = ss; TFIDFSimilarity s = ss;
for (int i = 1; i <=1000; i++) { for (int i = 1; i <=1000; i++) {
assertTrue("MIN tf: i="+i+" : s="+s.tf(i), assertTrue("MIN tf: i="+i+" : s="+s.tf(i),

View File

@@ -54,8 +54,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
 public Similarity get(String field) {
 return new DefaultSimilarity() {
 @Override
-public float computeNorm(FieldInvertState state) {
-return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
+public byte computeNorm(FieldInvertState state) {
+return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
 }
 };
 }
@@ -175,8 +175,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
 public Similarity get(String field) {
 return new DefaultSimilarity() {
 @Override
-public float computeNorm(FieldInvertState state) {
-return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
+public byte computeNorm(FieldInvertState state) {
+return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
 }
 };
 }

View File

@@ -51,7 +51,11 @@ import org.apache.lucene.util.PriorityQueue;
 */
 public class FuzzyLikeThisQuery extends Query
 {
-static Similarity sim=new DefaultSimilarity();
+// TODO: generalize this query (at least it should not reuse this static sim!
+// a better way might be to convert this into multitermquery rewrite methods.
+// the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq)
+// provided to TermQuery, so that the general idea is agnostic to any scoring system...
+static TFIDFSimilarity sim=new DefaultSimilarity();
 Query rewrittenQuery=null;
 ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
 Analyzer analyzer;

View File

@@ -44,6 +44,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.FSDirectory;
@@ -285,7 +286,7 @@ public final class MoreLikeThis {
 /**
 * For idf() calculations.
 */
-private Similarity similarity;// = new DefaultSimilarity();
+private TFIDFSimilarity similarity;// = new DefaultSimilarity();
 /**
 * IndexReader to use
@@ -320,17 +321,17 @@ public final class MoreLikeThis {
 this(ir, new DefaultSimilarity());
 }
-public MoreLikeThis(IndexReader ir, Similarity sim){
+public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){
 this.ir = ir;
 this.similarity = sim;
 }
-public Similarity getSimilarity() {
+public TFIDFSimilarity getSimilarity() {
 return similarity;
 }
-public void setSimilarity(Similarity similarity) {
+public void setSimilarity(TFIDFSimilarity similarity) {
 this.similarity = similarity;
 }

View File

@@ -81,13 +81,13 @@ public abstract class AbstractField implements Fieldable {
 * default, in the {@link
 * org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
 * by the length normalization factor and then
-* rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the
+* rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
 * index. One should attempt to ensure that this product does not overflow
 * the range of that encoding.
 *
 * @see org.apache.lucene.document.Document#setBoost(float)
 * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
-* @see org.apache.lucene.search.Similarity#encodeNormValue(float)
+* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
 */
 public void setBoost(float boost) {
 this.boost = boost;

View File

@@ -48,13 +48,13 @@ public interface Fieldable {
 * default, in the {@link
 * org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
 * by the length normalization factor
-* and then rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the
+* and then rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
 * index. One should attempt to ensure that this product does not overflow
 * the range of that encoding.
 *
 * @see org.apache.lucene.document.Document#setBoost(float)
 * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
-* @see org.apache.lucene.search.Similarity#encodeNormValue(float)
+* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
 */
 void setBoost(float boost);

View File

@@ -1025,7 +1025,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
 public abstract byte[] norms(String field) throws IOException;
 /** Expert: Resets the normalization factor for the named field of the named
-* document. The norm represents the product of the field's {@link
+* document. By default, the norm represents the product of the field's {@link
 * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its
 * length normalization}. Thus, to preserve the length normalization
 * values when resetting this, one should base the new value upon the old.
@@ -1034,7 +1034,8 @@ public abstract class IndexReader implements Cloneable,Closeable {
 * this method throws {@link IllegalStateException}.
 *
 * @see #norms(String)
-* @see Similarity#decodeNormValue(byte)
+* @see Similarity#computeNorm(FieldInvertState)
+* @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
 * @throws StaleReaderException if the index has changed
 * since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
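A hedged sketch of the setNorm usage this javadoc now points at: the byte handed to setNorm is an already-encoded value, typically produced by DefaultSimilarity.encodeNormValue(float). The reader, docID and field name are illustrative:

DefaultSimilarity sim = new DefaultSimilarity();
byte encoded = sim.encodeNormValue(0.5f);   // e.g. re-impose a length norm of 0.5
reader.setNorm(docID, "body", encoded);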

View File

@@ -72,8 +72,7 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
 assert norms.length == upto;
 norms = ArrayUtil.grow(norms, 1+upto);
 }
-final float norm = similarity.computeNorm(fieldState);
-norms[upto] = similarity.encodeNormValue(norm);
+norms[upto] = similarity.computeNorm(fieldState);
 docIDs[upto] = docState.docID;
 upto++;
 }

View File

@@ -183,14 +183,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
 public Query getQuery() { return BooleanQuery.this; }
 @Override
-public float getValue() { return getBoost(); }
-@Override
-public float sumOfSquaredWeights() throws IOException {
+public float getValueForNormalization() throws IOException {
 float sum = 0.0f;
 for (int i = 0 ; i < weights.size(); i++) {
 // call sumOfSquaredWeights for all clauses in case of side effects
-float s = weights.get(i).sumOfSquaredWeights(); // sum sub weights
+float s = weights.get(i).getValueForNormalization(); // sum sub weights
 if (!clauses.get(i).isProhibited())
 // only add to sum for non-prohibited clauses
 sum += s;
@@ -206,11 +203,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
 }
 @Override
-public void normalize(float norm) {
-norm *= getBoost(); // incorporate boost
+public void normalize(float norm, float topLevelBoost) {
+topLevelBoost *= getBoost(); // incorporate boost
 for (Weight w : weights) {
 // normalize all clauses, (even if prohibited in case of side affects)
-w.normalize(norm);
+w.normalize(norm, topLevelBoost);
 }
 }

View File

@@ -27,7 +27,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.PerReaderTermState;
+import org.apache.lucene.util.TermContext;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@@ -77,7 +77,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
 }
 @Override
-protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) {
+protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
 topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
 }
@@ -140,9 +140,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
 assert termState != null;
 if (pos < 0) {
 pos = (-pos)-1;
-array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq());
+array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
 } else {
-array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq());
+array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
 }
 return true;
 }
@@ -183,9 +183,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
 return true;
 }
-/** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */
+/** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
 static final class TermStateByteStart extends DirectBytesStartArray {
-PerReaderTermState[] termState;
+TermContext[] termState;
 public TermStateByteStart(int initSize) {
 super(initSize);
@@ -194,7 +194,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
 @Override
 public int[] init() {
 final int[] ord = super.init();
-termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
 assert termState.length >= ord.length;
 return ord;
 }
@@ -203,7 +203,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
 public int[] grow() {
 final int[] ord = super.grow();
 if (termState.length < ord.length) {
-PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
 System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
 termState = tmpTermState;
 }

View File

@@ -110,24 +110,19 @@ public class ConstantScoreQuery extends Query {
 }
 @Override
-public float getValue() {
-return queryWeight;
-}
-@Override
-public float sumOfSquaredWeights() throws IOException {
+public float getValueForNormalization() throws IOException {
 // we calculate sumOfSquaredWeights of the inner weight, but ignore it (just to initialize everything)
-if (innerWeight != null) innerWeight.sumOfSquaredWeights();
+if (innerWeight != null) innerWeight.getValueForNormalization();
 queryWeight = getBoost();
 return queryWeight * queryWeight;
 }
 @Override
-public void normalize(float norm) {
-this.queryNorm = norm;
+public void normalize(float norm, float topLevelBoost) {
+this.queryNorm = norm * topLevelBoost;
 queryWeight *= this.queryNorm;
 // we normalize the inner weight, but ignore it (just to initialize everything)
-if (innerWeight != null) innerWeight.normalize(norm);
+if (innerWeight != null) innerWeight.normalize(norm, topLevelBoost);
 }
 @Override
@@ -148,7 +143,7 @@ public class ConstantScoreQuery extends Query {
 if (disi == null) {
 return null;
 }
-return new ConstantScorer(disi, this);
+return new ConstantScorer(disi, this, queryWeight);
 }
 @Override
@@ -181,9 +176,9 @@ public class ConstantScoreQuery extends Query {
 final DocIdSetIterator docIdSetIterator;
 final float theScore;
-public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException {
+public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w, float theScore) throws IOException {
 super(w);
-theScore = w.getValue();
+this.theScore = theScore;
 this.docIdSetIterator = docIdSetIterator;
 }
@@ -212,7 +207,7 @@ public class ConstantScoreQuery extends Query {
 @Override
 public void setScorer(Scorer scorer) throws IOException {
 // we must wrap again here, but using the scorer passed in as parameter:
-collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight));
+collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight, ConstantScorer.this.theScore));
 }
 @Override

View File

@@ -20,7 +20,7 @@ import org.apache.lucene.index.FieldInvertState;
 */
 /** Expert: Default scoring implementation. */
-public class DefaultSimilarity extends Similarity {
+public class DefaultSimilarity extends TFIDFSimilarity {
 /** Implemented as
 * <code>state.getBoost()*lengthNorm(numTerms)</code>, where
@@ -31,13 +31,13 @@ public class DefaultSimilarity extends Similarity {
 *
 * @lucene.experimental */
 @Override
-public float computeNorm(FieldInvertState state) {
+public byte computeNorm(FieldInvertState state) {
 final int numTerms;
 if (discountOverlaps)
 numTerms = state.getLength() - state.getNumOverlap();
 else
 numTerms = state.getLength();
-return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
+return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
 }
 /** Implemented as <code>sqrt(freq)</code>. */
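To make the formula above concrete, a small sketch that prints the decoded norm for a few field lengths (boost 1.0, no overlapping tokens). With this default implementation the values come out near 1.0, 0.5, 0.25 and 0.125, up to byte quantization:

DefaultSimilarity sim = new DefaultSimilarity();
FieldInvertState state = new FieldInvertState();
state.setBoost(1.0f);
for (int numTerms : new int[] {1, 4, 16, 64}) {
  state.setLength(numTerms);
  byte encoded = sim.computeNorm(state);   // boost * 1/sqrt(numTerms), then encoded
  System.out.println(numTerms + " terms -> " + sim.decodeNormValue(encoded));
}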

View File

@@ -110,16 +110,12 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
 @Override
 public Query getQuery() { return DisjunctionMaxQuery.this; }
-/** Return our boost */
-@Override
-public float getValue() { return getBoost(); }
 /** Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */
 @Override
-public float sumOfSquaredWeights() throws IOException {
+public float getValueForNormalization() throws IOException {
 float max = 0.0f, sum = 0.0f;
 for (Weight currentWeight : weights) {
-float sub = currentWeight.sumOfSquaredWeights();
+float sub = currentWeight.getValueForNormalization();
 sum += sub;
 max = Math.max(max, sub);
@@ -130,10 +126,10 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
 /** Apply the computed normalization factor to our subqueries */
 @Override
-public void normalize(float norm) {
-norm *= getBoost(); // Incorporate our boost
+public void normalize(float norm, float topLevelBoost) {
+topLevelBoost *= getBoost(); // Incorporate our boost
 for (Weight wt : weights) {
-wt.normalize(norm);
+wt.normalize(norm, topLevelBoost);
 }
 }

View File

@@ -23,12 +23,6 @@ import java.util.Arrays;
 import org.apache.lucene.index.*;
 final class ExactPhraseScorer extends Scorer {
-private final byte[] norms;
-private final float value;
-private static final int SCORE_CACHE_SIZE = 32;
-private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
 private final int endMinus1;
 private final static int CHUNK = 4096;
@@ -60,14 +54,12 @@ final class ExactPhraseScorer extends Scorer {
 private int docID = -1;
 private int freq;
-private final Similarity similarity;
+private final Similarity.ExactDocScorer docScorer;
 ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
-Similarity similarity, byte[] norms) throws IOException {
+Similarity.ExactDocScorer docScorer) throws IOException {
 super(weight);
-this.similarity = similarity;
-this.norms = norms;
-this.value = weight.getValue();
+this.docScorer = docScorer;
 chunkStates = new ChunkState[postings.length];
@@ -88,10 +80,6 @@ final class ExactPhraseScorer extends Scorer {
 return;
 }
 }
-for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
-scoreCache[i] = similarity.tf((float) i) * value;
-}
 }
 @Override
@@ -206,13 +194,7 @@ final class ExactPhraseScorer extends Scorer {
 @Override
 public float score() throws IOException {
-final float raw; // raw score
-if (freq < SCORE_CACHE_SIZE) {
-raw = scoreCache[freq];
-} else {
-raw = similarity.tf((float) freq) * value;
-}
-return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize
+return docScorer.score(docID, freq);
 }
 private int phraseFreq() throws IOException {
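The pattern the scorer now follows is worth spelling out: the Weight builds a per-segment Similarity.ExactDocScorer, and the Scorer only reports (doc, freq) pairs to it. A minimal sketch under that assumption; the class name and the DocsEnum plumbing are illustrative, only score(doc, freq) comes from this commit:

import java.io.IOException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;

final class SimpleTermScorer extends Scorer {
  private final DocsEnum postings;
  private final Similarity.ExactDocScorer docScorer;

  SimpleTermScorer(Weight weight, DocsEnum postings, Similarity.ExactDocScorer docScorer) {
    super(weight);
    this.postings = postings;
    this.docScorer = docScorer;
  }

  @Override
  public float score() throws IOException {
    // tf, norms and query weight all live behind the docScorer now
    return docScorer.score(postings.docID(), postings.freq());
  }

  @Override
  public int docID() { return postings.docID(); }

  @Override
  public int nextDoc() throws IOException { return postings.nextDoc(); }

  @Override
  public int advance(int target) throws IOException { return postings.advance(target); }
}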

View File

@@ -125,25 +125,4 @@ public class Explanation {
 return buffer.toString();
 }
-/**
-* Small Util class used to pass both an idf factor as well as an
-* explanation for that factor.
-*
-* This class will likely be held on a {@link Weight}, so be aware
-* before storing any large or un-serializable fields.
-*
-*/
-public static abstract class IDFExplanation {
-/**
-* @return the idf factor
-*/
-public abstract float getIdf();
-/**
-* This should be calculated lazily if possible.
-*
-* @return the explanation for the idf factor.
-*/
-public abstract String explain();
-}
 }

View File

@@ -63,21 +63,15 @@ extends Query {
 public Weight createWeight(final IndexSearcher searcher) throws IOException {
 final Weight weight = query.createWeight (searcher);
 return new Weight() {
-private float value;
-// pass these methods through to enclosed query's weight
-@Override
-public float getValue() { return value; }
 @Override
-public float sumOfSquaredWeights() throws IOException {
-return weight.sumOfSquaredWeights() * getBoost() * getBoost();
+public float getValueForNormalization() throws IOException {
+return weight.getValueForNormalization() * getBoost() * getBoost();
 }
 @Override
-public void normalize (float v) {
-weight.normalize(v);
-value = weight.getValue() * getBoost();
+public void normalize (float norm, float topLevelBoost) {
+weight.normalize(norm, topLevelBoost);
 }
 @Override

View File

@@ -674,11 +674,11 @@ public class IndexSearcher {
 public Weight createNormalizedWeight(Query query) throws IOException {
 query = rewrite(query);
 Weight weight = query.createWeight(this);
-float sum = weight.sumOfSquaredWeights();
-float norm = getSimilarityProvider().queryNorm(sum);
+float v = weight.getValueForNormalization();
+float norm = getSimilarityProvider().queryNorm(v);
 if (Float.isInfinite(norm) || Float.isNaN(norm))
 norm = 1.0f;
-weight.normalize(norm);
+weight.normalize(norm, 1.0f);
 return weight;
 }
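A hedged usage sketch of the method above: one normalized Weight per top-level query, then a Scorer per segment. It assumes a per-segment AtomicReaderContext named readerContext is at hand and that ScorerContext is the nested Weight.ScorerContext used elsewhere in this commit; the query itself is only an example:

Weight weight = searcher.createNormalizedWeight(new TermQuery(new Term("body", "lucene")));
Scorer scorer = weight.scorer(readerContext, Weight.ScorerContext.def());
if (scorer != null) {
  while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    System.out.println(scorer.docID() + " -> " + scorer.score());
  }
}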

View File

@ -32,35 +32,17 @@ import java.io.IOException;
*/ */
public class MatchAllDocsQuery extends Query { public class MatchAllDocsQuery extends Query {
public MatchAllDocsQuery() {
this(null);
}
private final String normsField;
/**
* @param normsField Field used for normalization factor (document boost). Null if nothing.
*/
public MatchAllDocsQuery(String normsField) {
this.normsField = normsField;
}
private class MatchAllScorer extends Scorer { private class MatchAllScorer extends Scorer {
final float score; final float score;
final byte[] norms;
private int doc = -1; private int doc = -1;
private final int maxDoc; private final int maxDoc;
private final Bits liveDocs; private final Bits liveDocs;
private final Similarity similarity;
MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, MatchAllScorer(IndexReader reader, Weight w, float score) throws IOException {
byte[] norms) throws IOException {
super(w); super(w);
this.similarity = similarity;
liveDocs = reader.getLiveDocs(); liveDocs = reader.getLiveDocs();
score = w.getValue(); this.score = score;
maxDoc = reader.maxDoc(); maxDoc = reader.maxDoc();
this.norms = norms;
} }
@Override @Override
@ -82,7 +64,7 @@ public class MatchAllDocsQuery extends Query {
@Override @Override
public float score() { public float score() {
return norms == null ? score : score * similarity.decodeNormValue(norms[docID()]); return score;
} }
@Override @Override
@ -93,12 +75,10 @@ public class MatchAllDocsQuery extends Query {
} }
private class MatchAllDocsWeight extends Weight { private class MatchAllDocsWeight extends Weight {
private Similarity similarity;
private float queryWeight; private float queryWeight;
private float queryNorm; private float queryNorm;
public MatchAllDocsWeight(IndexSearcher searcher) { public MatchAllDocsWeight(IndexSearcher searcher) {
this.similarity = normsField == null ? null : searcher.getSimilarityProvider().get(normsField);
} }
@Override @Override
@ -112,33 +92,27 @@ public class MatchAllDocsQuery extends Query {
} }
@Override @Override
public float getValue() { public float getValueForNormalization() {
return queryWeight;
}
@Override
public float sumOfSquaredWeights() {
queryWeight = getBoost(); queryWeight = getBoost();
return queryWeight * queryWeight; return queryWeight * queryWeight;
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; queryWeight *= this.queryNorm;
} }
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new MatchAllScorer(context.reader, similarity, this, return new MatchAllScorer(context.reader, this, queryWeight);
normsField != null ? context.reader.norms(normsField) : null);
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) { public Explanation explain(AtomicReaderContext context, int doc) {
// explain query weight // explain query weight
Explanation queryExpl = new ComplexExplanation Explanation queryExpl = new ComplexExplanation
(true, getValue(), "MatchAllDocsQuery, product of:"); (true, queryWeight, "MatchAllDocsQuery, product of:");
if (getBoost() != 1.0f) { if (getBoost() != 1.0f) {
queryExpl.addDetail(new Explanation(getBoost(),"boost")); queryExpl.addDetail(new Explanation(getBoost(),"boost"));
} }

View File

@ -22,12 +22,14 @@ import java.util.*;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -129,45 +131,35 @@ public class MultiPhraseQuery extends Query {
private class MultiPhraseWeight extends Weight { private class MultiPhraseWeight extends Weight {
private Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private final IDFExplanation idfExp;
private float idf;
private float queryNorm;
private float queryWeight;
public MultiPhraseWeight(IndexSearcher searcher) public MultiPhraseWeight(IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field); this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
// compute idf // compute idf
ArrayList<Term> allTerms = new ArrayList<Term>(); ArrayList<TermContext> allTerms = new ArrayList<TermContext>();
for(final Term[] terms: termArrays) { for(final Term[] terms: termArrays) {
for (Term term: terms) { for (Term term: terms) {
allTerms.add(term); allTerms.add(TermContext.build(context, term, true));
} }
} }
idfExp = similarity.idfExplain(allTerms, searcher); stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()]));
idf = idfExp.getIdf();
} }
@Override @Override
public Query getQuery() { return MultiPhraseQuery.this; } public Query getQuery() { return MultiPhraseQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -222,8 +214,7 @@ public class MultiPhraseQuery extends Query {
} }
if (slop == 0) { if (slop == 0) {
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
reader.norms(field));
if (s.noDocs) { if (s.noDocs) {
return null; return null;
} else { } else {
@ -231,87 +222,32 @@ public class MultiPhraseQuery extends Query {
} }
} else { } else {
return new SloppyPhraseScorer(this, postingsFreqs, similarity, return new SloppyPhraseScorer(this, postingsFreqs, similarity,
slop, reader.norms(field)); slop, similarity.sloppyDocScorer(stats, field, context));
} }
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + idfExp.explain() +")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
"), product of:");
Scorer scorer = scorer(context, ScorerContext.def()); Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) { if (scorer != null) {
return new Explanation(0.0f, "no matching docs"); int newDoc = scorer.advance(doc);
} if (newDoc == doc) {
float freq = scorer.freq();
Explanation tfExplanation = new Explanation(); SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
int d = scorer.advance(doc); ComplexExplanation result = new ComplexExplanation();
float phraseFreq; result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
if (d == doc) { Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
phraseFreq = scorer.freq(); result.addDetail(scoreExplanation);
} else { result.setValue(scoreExplanation.getValue());
phraseFreq = 0.0f; result.setMatch(true);
}
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result; return result;
} }
} }
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
@Override @Override
public Query rewrite(IndexReader reader) { public Query rewrite(IndexReader reader) {
if (termArrays.size() == 1) { // optimize one-term case if (termArrays.size() == 1) { // optimize one-term case

View File

@@ -25,7 +25,7 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.PerReaderTermState;
+import org.apache.lucene.util.TermContext;
 /**
 * An abstract {@link Query} that matches documents
@@ -154,7 +154,7 @@ public abstract class MultiTermQuery extends Query {
 }
 @Override
-protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
+protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, TermContext states) {
 final TermQuery tq = new TermQuery(term, states);
 tq.setBoost(boost);
 topLevel.add(tq, BooleanClause.Occur.SHOULD);
@@ -195,7 +195,7 @@ public abstract class MultiTermQuery extends Query {
 }
 @Override
-protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
+protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
 final Query q = new ConstantScoreQuery(new TermQuery(term, states));
 q.setBoost(boost);
 topLevel.add(q, BooleanClause.Occur.SHOULD);
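The PerReaderTermState to TermContext rename above changes how term statistics reach TermQuery. A hedged sketch of the lookup-once pattern used throughout this commit; the searcher, field and term are illustrative, and the boolean flag is passed exactly as the other call sites in this commit pass it:

IndexReader.ReaderContext topContext = searcher.getTopReaderContext();
Term term = new Term("body", "lucene");
TermContext states = TermContext.build(topContext, term, true);
TermQuery query = new TermQuery(term, states);   // reuses the gathered docFreq/totalTermFreq
query.setBoost(2.0f);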

View File

@ -22,10 +22,16 @@ import java.util.Set;
import java.util.ArrayList; import java.util.ArrayList;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -171,18 +177,17 @@ public class PhraseQuery extends Query {
private class PhraseWeight extends Weight { private class PhraseWeight extends Weight {
private final Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private float idf; private transient TermContext states[];
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(IndexSearcher searcher) public PhraseWeight(IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field); this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
idfExp = similarity.idfExplain(terms, searcher); states = new TermContext[terms.size()];
idf = idfExp.getIdf(); for (int i = 0; i < terms.size(); i++)
states[i] = TermContext.build(context, terms.get(i), true);
stats = similarity.computeStats(searcher, field, getBoost(), states);
} }
@Override @Override
@ -192,19 +197,13 @@ public class PhraseQuery extends Query {
public Query getQuery() { return PhraseQuery.this; } public Query getQuery() { return PhraseQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -216,21 +215,26 @@ public class PhraseQuery extends Query {
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()]; PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
for (int i = 0; i < terms.size(); i++) { for (int i = 0; i < terms.size(); i++) {
final Term t = terms.get(i); final Term t = terms.get(i);
final TermState state = states[i].get(context.ord);
if (state == null) { /* term doesnt exist in this segment */
assert termNotInReader(reader, field, t.bytes()) : "no termstate found but term exists in reader";
return null;
}
DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs, DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs,
t.field(), t.field(),
t.bytes()); t.bytes(),
state);
// PhraseQuery on a field that did not index // PhraseQuery on a field that did not index
// positions. // positions.
if (postingsEnum == null) { if (postingsEnum == null) {
if (reader.termDocsEnum(liveDocs, t.field(), t.bytes()) != null) { assert (reader.termDocsEnum(liveDocs, t.field(), t.bytes(), state) != null) : "termstate found but no term exists in reader";
// term does exist, but has no positions // term does exist, but has no positions
throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")");
} else {
// term does not exist
return null;
} }
} // get the docFreq without seeking
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t); TermsEnum te = reader.fields().terms(field).getThreadTermsEnum();
te.seekExact(t.bytes(), state);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t);
} }
// sort by increasing docFreq order // sort by increasing docFreq order
@ -239,8 +243,7 @@ public class PhraseQuery extends Query {
} }
if (slop == 0) { // optimize exact case if (slop == 0) { // optimize exact case
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
reader.norms(field));
if (s.noDocs) { if (s.noDocs) {
return null; return null;
} else { } else {
@ -248,99 +251,38 @@ public class PhraseQuery extends Query {
} }
} else { } else {
return return
new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context));
reader.norms(field));
} }
} }
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
// only called from assert
final Terms terms = reader.terms(field);
return terms == null || terms.docFreq(bytes) == 0;
}
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
StringBuilder docFreqs = new StringBuilder();
StringBuilder query = new StringBuilder();
query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) {
if (i != 0) {
query.append(" ");
}
Term term = terms.get(i);
query.append(term.text());
}
query.append('\"');
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
"), product of:");
Scorer scorer = scorer(context, ScorerContext.def()); Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) { if (scorer != null) {
return new Explanation(0.0f, "no matching docs"); int newDoc = scorer.advance(doc);
} if (newDoc == doc) {
Explanation tfExplanation = new Explanation(); float freq = scorer.freq();
int d = scorer.advance(doc); SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
float phraseFreq; ComplexExplanation result = new ComplexExplanation();
if (d == doc) { result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
phraseFreq = scorer.freq(); Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
} else { result.addDetail(scoreExplanation);
phraseFreq = 0.0f; result.setValue(scoreExplanation.getValue());
} result.setMatch(true);
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
result.setMatch(tfExplanation.isMatch());
return result; return result;
} }
} }
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
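As an illustrative aside (not part of this patch): the rewritten explain() above only reports the phrase frequency; the Similarity's SloppyDocScorer turns that into the score explanation. A Similarity implementation can override the default explain() to expose its own components, roughly like this (queryWeight is a hypothetical value carried by that Similarity's Stats):

    // Hedged sketch: a custom SloppyDocScorer surfacing model-specific details.
    @Override
    public Explanation explain(int doc, Explanation freq) {
      Explanation result = new Explanation(score(doc, freq.getValue()),
          "my-model score(doc=" + doc + "), computed from:");
      result.addDetail(freq);                                       // how the frequency was derived
      result.addDetail(new Explanation(queryWeight, "queryWeight")); // hypothetical detail
      return result;
    }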
@Override @Override
public Weight createWeight(IndexSearcher searcher) throws IOException { public Weight createWeight(IndexSearcher searcher) throws IOException {
if (terms.size() == 1) { // optimize one-term case if (terms.size() == 1) { // optimize one-term case

View File

@ -30,9 +30,6 @@ import java.io.IOException;
* means a match. * means a match.
*/ */
abstract class PhraseScorer extends Scorer { abstract class PhraseScorer extends Scorer {
protected byte[] norms;
protected float value;
private boolean firstTime = true; private boolean firstTime = true;
private boolean more = true; private boolean more = true;
protected PhraseQueue pq; protected PhraseQueue pq;
@ -40,14 +37,12 @@ abstract class PhraseScorer extends Scorer {
private float freq; //phrase frequency in current doc as computed by phraseFreq(). private float freq; //phrase frequency in current doc as computed by phraseFreq().
protected final Similarity similarity; protected final Similarity.SloppyDocScorer docScorer;
PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) { Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.docScorer = docScorer;
this.norms = norms;
this.value = weight.getValue();
// convert tps to a list of phrase positions. // convert tps to a list of phrase positions.
// note: phrase-position differs from term-position in that its position // note: phrase-position differs from term-position in that its position
@ -107,9 +102,7 @@ abstract class PhraseScorer extends Scorer {
@Override @Override
public float score() throws IOException { public float score() throws IOException {
//System.out.println("scoring " + first.doc); return docScorer.score(first.doc, freq);
float raw = similarity.tf(freq) * value; // raw score
return norms == null ? raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize
} }
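The tf/norm arithmetic that used to live in score() now belongs to the Similarity. As an illustrative aside (not part of this patch), a SloppyDocScorer could reproduce the removed behavior by capturing the segment's norms when it is created; in this sketch, weight and fieldName are hypothetical values supplied by the enclosing Weight/Stats:

    // Hedged sketch of a SloppyDocScorer equivalent to the removed logic:
    // raw tf * query weight, multiplied by the decoded per-document norm byte.
    final byte[] norms = context.reader.norms(fieldName);   // bound once per segment
    return new SloppyDocScorer() {
      @Override
      public float score(int doc, float freq) {
        float raw = (float) Math.sqrt(freq) * weight;        // tf(freq) * weight
        return norms == null ? raw : raw * SmallFloat.byte315ToFloat(norms[doc]);
      }
    };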
@Override @Override

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -56,7 +56,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
@Override @Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, protected void addClause(BooleanQuery topLevel, Term term, int docCount,
float boost, PerReaderTermState states) { float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states); final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost); tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD); topLevel.add(tq, BooleanClause.Occur.SHOULD);
@ -117,7 +117,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (size > 0) { if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator()); final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final float[] boost = col.array.boost; final float[] boost = col.array.boost;
final PerReaderTermState[] termStates = col.array.termState; final TermContext[] termStates = col.array.termState;
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
final int pos = sort[i]; final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef())); final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
@ -150,12 +150,12 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (e < 0 ) { if (e < 0 ) {
// duplicate term: update docFreq // duplicate term: update docFreq
final int pos = (-e)-1; final int pos = (-e)-1;
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq()); array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else { } else {
// new entry: we populate the entry initially // new entry: we populate the entry initially
array.boost[e] = boostAtt.getBoost(); array.boost[e] = boostAtt.getBoost();
array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq()); array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
ScoringRewrite.this.checkMaxClauseCount(terms.size()); ScoringRewrite.this.checkMaxClauseCount(terms.size());
} }
return true; return true;
@ -165,7 +165,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray { static final class TermFreqBoostByteStart extends DirectBytesStartArray {
float[] boost; float[] boost;
PerReaderTermState[] termState; TermContext[] termState;
public TermFreqBoostByteStart(int initSize) { public TermFreqBoostByteStart(int initSize) {
super(initSize); super(initSize);
@ -175,7 +175,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
public int[] init() { public int[] init() {
final int[] ord = super.init(); final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length && boost.length >= ord.length; assert termState.length >= ord.length && boost.length >= ord.length;
return ord; return ord;
} }
@ -185,7 +185,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
final int[] ord = super.grow(); final int[] ord = super.grow();
boost = ArrayUtil.grow(boost, ord.length); boost = ArrayUtil.grow(boost, ord.length);
if (termState.length < ord.length) { if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length); System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState; termState = tmpTermState;
} }
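For reference, the two TermContext calls above follow the usual accumulation pattern: the first segment that contains the term constructs the context, and every later segment registers its own term state and statistics. A condensed sketch (variable names hypothetical, segment iteration elided):

    // Hedged sketch: accumulate one term's per-segment state into a TermContext.
    // `state` is the current segment's TermState, `leafOrd` its ord within the
    // top-level reader context, and `termsEnum` is positioned on the term.
    if (termContext == null) {
      termContext = new TermContext(topReaderContext, state, leafOrd,
                                    termsEnum.docFreq(), termsEnum.totalTermFreq());
    } else {
      termContext.register(state, leafOrd,
                           termsEnum.docFreq(), termsEnum.totalTermFreq());
    }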

View File

@ -19,594 +19,111 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.document.IndexDocValuesField; // javadoc
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.SmallFloat; import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.SmallFloat; // javadoc
import org.apache.lucene.util.TermContext;
/** /**
* Expert: Scoring API. * Similarity defines the components of Lucene scoring.
*
* <p>Similarity defines the components of Lucene scoring.
* Overriding computation of these components is a convenient
* way to alter Lucene scoring.
*
* <p>Suggested reading:
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html">
* Introduction To Information Retrieval, Chapter 6</a>.
*
* <p>The following describes how Lucene scoring evolves from
* underlying information retrieval models to (efficient) implementation.
* We first brief on <i>VSM Score</i>,
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>,
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i>
* (the latter is connected directly with Lucene classes and methods).
*
* <p>Lucene combines
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model">
* Boolean model (BM) of Information Retrieval</a>
* with
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
* Vector Space Model (VSM) of Information Retrieval</a> -
* documents "approved" by BM are scored by VSM.
*
* <p>In VSM, documents and queries are represented as
* weighted vectors in a multi-dimensional space,
* where each distinct index term is a dimension,
* and weights are
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values.
*
* <p>VSM does not require weights to be <i>Tf-idf</i> values,
* but <i>Tf-idf</i> values are believed to produce search results of high quality,
* and so Lucene is using <i>Tf-idf</i>.
* <i>Tf</i> and <i>Idf</i> are described in more detail below,
* but for now, for completion, let's just say that
* for given term <i>t</i> and document (or query) <i>x</i>,
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i>
* (when one increases so does the other) and
* <i>idf(t)</i> similarly varies with the inverse of the
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its euclidean norm is normalizing it to a unit vector.
*
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this info is probably ok,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field, document length normalization is by the length of the certain field,
* and in addition to document boost there are also document fields boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the multiplication of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is correct for some of the queries),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From search optimization perspective, it is a valid question
* why bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence documents ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a> can be used find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
* <tr valigh="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <ol>
* <li>
* <A NAME="formula_tf"></A>
* <b><i>tf(t in d)</i></b>
* correlates to the term's <i>frequency</i>,
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
* Documents that have more occurrences of a given term receive a higher score.
* Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation,
* However if a query contains twice the same term, there will be
* two term-queries with that same term and hence the computation would still be correct (although
* not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
* This means rarer terms give higher contribution to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
* This is a search time factor computed by the SimilarityProvider in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
* <tr valigh="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
* Notice that there is really no direct API for accessing a boost of one term in a multi term query,
* but rather multi terms are represented in a query as multi
* {@link org.apache.lucene.search.TermQuery TermQuery} objects,
* and so the boost of a term in the query is accessible by calling the sub-query
* {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
* <li><b>lengthNorm</b> - computed
* when the document is added to the index in accordance with the number of tokens
* of this field in the document, so that shorter fields contribute more to the score.
* LengthNorm is computed by the Similarity class in effect at indexing.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p> * <p>
* When a document is added to the index, all the above factors are multiplied. * Expert: Scoring API.
* If the document has multiple fields with the same name, all their boosts are multiplied together: * <p>
* * This is a low-level API, you should only extend this API if you want to implement
* <br>&nbsp;<br> * an information retrieval <i>model</i>. If you are instead looking for a convenient way
* <table cellpadding="1" cellspacing="0" border="0"n align="center"> * to alter Lucene's scoring, consider extending a higher-level implementation
* <tr> * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
* <td valign="middle" align="right" rowspan="1"> * just tweaking the default implementation: {@link DefaultSimilarity}.
* norm(t,d) &nbsp; = &nbsp; * <p>
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} * Similarity determines how Lucene weights terms, and Lucene interacts with
* &nbsp;&middot;&nbsp; * this class at both <a href="#indextime">index-time</a> and
* lengthNorm * <a href="#querytime">query-time</a>.
* &nbsp;&middot;&nbsp; * <p>
* </td> * <a name="indextime"/>
* <td valign="bottom" align="center" rowspan="1"> * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* <big><big><big>&prod;</big></big></big> * the Similarity implementation to return a per-document byte for the field that will
* </td> * be later accessible via {@link IndexReader#norms(String)}. Lucene makes no assumption
* <td valign="middle" align="right" rowspan="1"> * about what is in this byte, but it is most useful for encoding length normalization
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}() * information.
* </td> * <p>
* </tr> * Implementations should carefully consider how the normalization byte is encoded: while
* <tr valigh="top"> * Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost
* <td></td> * and length normalization information with {@link SmallFloat}, this might not be suitable
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td> * for all purposes.
* <td></td> * <p>
* </tr> * Many formulas require the use of average document length, which can be computed via a
 * </table> * combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()}.
* <br>&nbsp;<br> * <p>
* However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte * Because index-time boost is handled entirely at the application level anyway,
* before being stored. * an application can alternatively store the index-time boost separately using an
* At search time, the norm byte value is read from the index * {@link IndexDocValuesField}, and access this at query-time with
* {@link org.apache.lucene.store.Directory directory} and * {@link IndexReader#docValues(String)}.
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value. * <p>
* This encoding/decoding, while reducing index size, comes with the price of * Finally, using index-time boosts (either via folding into the normalization byte or
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. * via IndexDocValues), is an inefficient way to boost the scores of different fields if the
* For instance, <i>decode(encode(0.89)) = 0.75</i>. * boost will be the same for every document, instead the Similarity can simply take a constant
* <br>&nbsp;<br> * boost parameter <i>C</i>, and the SimilarityProvider can return different instances with
* Compression of norm values to a single byte saves memory at search time, * different boosts depending upon field name.
* because once a field is referenced at search time, its norms - for * <p>
* all documents - are maintained in memory. * <a name="querytime"/>
* <br>&nbsp;<br> * At query-time, Queries interact with the Similarity via these steps:
* The rationale supporting such lossy compression of norm values is that * <ol>
* given the difficulty (and inaccuracy) of users to express their true information * <li>The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
* need by a query, only big differences matter. * allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* <br>&nbsp;<br> * across <i>the entire collection</i>. The {@link TermContext}s passed in are already positioned
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by * to the terms involved with the raw statistics involved, so a Similarity can freely use any combination
* using a different {@link Similarity} for search. * of term statistics without causing any additional I/O. Lucene makes no assumption about what is
* <br>&nbsp;<br> * stored in the returned {@link Similarity.Stats} object.
* </li> * <li>The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
* is called for each query leaf node, {@link SimilarityProvider#queryNorm(float)} is called for the top-level
* query, and finally {@link Similarity.Stats#normalize(float, float)} passes down the normalization value
* and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
* <li>For each segment in the index, the Query creates a {@link #exactDocScorer(Stats, String, IndexReader.AtomicReaderContext)}
* (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a
* {@link #sloppyDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with sloppy frequencies such as
* SpanQuerys and sloppy PhraseQueries). The score() method is called for each matching document.
* </ol> * </ol>
* <p>
* <a name="explaintime"/>
* When {@link IndexSearcher#explain(Query, int)} is called, queries consult the Similarity's DocScorer for an
 * explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
* was computed.
* *
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider) * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider) * @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
* @lucene.experimental
*/ */
public abstract class Similarity { public abstract class Similarity {
public static final int NO_DOC_ID_PROVIDED = -1; public static final int NO_DOC_ID_PROVIDED = -1;
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** /**
* Computes the normalization value for a field, given the accumulated * Computes the normalization value for a field, given the accumulated
* state of term processing for this field (see {@link FieldInvertState}). * state of term processing for this field (see {@link FieldInvertState}).
* *
* <p>Implementations should calculate a float value based on the field * <p>Implementations should calculate a byte value based on the field
* state and then return that value. * state and then return that value.
* *
* <p>Matches in longer fields are less precise, so implementations of this * <p>Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when <code>state.getLength()</code> is large, * method usually return smaller values when <code>state.getLength()</code> is large,
* and larger values when <code>state.getLength()</code> is small. * and larger values when <code>state.getLength()</code> is small.
* *
* <p>Note that the return values are computed under
* {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
* and then stored using
* {@link #encodeNormValue(float)}.
* Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* @lucene.experimental * @lucene.experimental
* *
* @param state current processing state for this field * @param state current processing state for this field
* @return the calculated float norm * @return the calculated byte norm
*/ */
public abstract float computeNorm(FieldInvertState state); public abstract byte computeNorm(FieldInvertState state);
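Since computeNorm now returns the byte that is written to the index directly, the encoding step is the implementation's responsibility. A minimal sketch (not part of this patch), assuming the classic SmallFloat encoding that encodeNormValue used and a 1/sqrt(length) length factor:

    @Override
    public byte computeNorm(FieldInvertState state) {
      // fold the field boost and length normalization into a single byte,
      // reusing the 3-bit-mantissa encoding the old encodeNormValue applied
      final float lengthNorm = (float) (1.0 / Math.sqrt(state.getLength()));
      return SmallFloat.floatToByte315(state.getBoost() * lengthNorm);
    }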
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance. /** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form * This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to {@link #tf(float)}. * the frequency to be used in scoring instead of the exact term count.
* *
* <p>A phrase match with a small edit distance to a document passage more * <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually * closely matches the document, so implementations of this method usually
@ -619,124 +136,6 @@ public abstract class Similarity {
*/ */
public abstract float sloppyFreq(int distance); public abstract float sloppyFreq(int distance);
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
* Note that {@link IndexSearcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link IndexSearcher#docFreq(Term)} is used, and when the latter
* is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
* In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute
*
* @param term the term in question
* @param searcher the document collection being searched
* @param docFreq externally computed docFreq for this term
* @return an IDFExplain object that includes both an idf score factor
and an explanation for the term.
* @throws IOException
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException {
final int df = docFreq;
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new IDFExplanation() {
@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
@Override
public float getIdf() {
return idf;
}};
}
/**
* This method forwards to {@link
* #idfExplain(Term,IndexSearcher,int)} by passing
* <code>searcher.docFreq(term)</code> as the docFreq.
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException {
return idfExplain(term, searcher, searcher.docFreq(term));
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return an IDFExplain object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final StringBuilder exp = new StringBuilder();
for (final Term term : terms ) {
final int df = searcher.docFreq(term);
idf += idf(df, max);
exp.append(" ");
exp.append(term.text());
exp.append("=");
exp.append(df);
}
final float fIdf = idf;
return new IDFExplanation() {
@Override
public float getIdf() {
return fIdf;
}
@Override
public String explain() {
return exp.toString();
}
};
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
/** /**
* Calculate a scoring factor based on the data in the payload. Overriding implementations * Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
@ -759,4 +158,100 @@ public abstract class Similarity {
return 1; return 1;
} }
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
*/
public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException;
/**
* returns a new {@link Similarity.ExactDocScorer}.
*/
public abstract ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
/**
* returns a new {@link Similarity.SloppyDocScorer}.
*/
public abstract SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
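Putting the three methods above together, the query-time flow described in the class javadoc looks roughly like this from a Weight's point of view (a hedged sketch; variable names and the surrounding plumbing are hypothetical):

    // 1. once per query: gather collection-level statistics for the involved terms
    Similarity sim = similarityProvider.get(field);           // per-field Similarity
    Similarity.Stats stats = sim.computeStats(searcher, field, getBoost(), termContext);

    // 2. once per query: the normalization handshake (a Similarity may ignore it)
    float queryNorm = similarityProvider.queryNorm(stats.getValueForNormalization());
    stats.normalize(queryNorm, 1.0f /* topLevelBoost from enclosing queries */);

    // 3. once per segment: bind a doc scorer to the segment and score each match
    Similarity.ExactDocScorer docScorer = sim.exactDocScorer(stats, field, readerContext);
    float score = docScorer.score(docID, termFreq);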
/**
* API for scoring exact queries such as {@link TermQuery} and
* exact {@link PhraseQuery}.
* <p>
* Term frequencies are integers (the term or phrase's tf)
*/
public abstract class ExactDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq term frequency
* @return document's score
*/
public abstract float score(int doc, int freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the term frequency was computed
 * @return an Explanation of the document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, (int)freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
/**
* API for scoring "sloppy" queries such as {@link SpanQuery} and
* sloppy {@link PhraseQuery}.
* <p>
* Term frequencies are floating point values.
*/
public abstract class SloppyDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq sloppy term frequency
* @return document's score
*/
public abstract float score(int doc, float freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the sloppy term frequency was computed
 * @return an Explanation of the document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
/** Stores the statistics for the indexed collection. This abstract
* implementation is empty; descendants of {@code Similarity} should
* subclass {@code Stats} and define the statistics they require in the
* subclass. Examples include idf, average field length, etc.
*/
public static abstract class Stats {
/** The value for normalization of contained query clauses (e.g. sum of squared weights).
* <p>
* NOTE: a Similarity implementation might not use any query normalization at all,
 * it's not required. However, if it wants to participate in query normalization,
* it can return a value here.
*/
public abstract float getValueForNormalization();
/** Assigns the query normalization factor and boost from parent queries to this.
* <p>
* NOTE: a Similarity implementation might not use this normalized value at all,
 * it's not required. However, it's usually a good idea to at least incorporate
* the topLevelBoost (e.g. from an outer BooleanQuery) into its score.
*/
public abstract void normalize(float queryNorm, float topLevelBoost);
}
} }
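To make the new contract concrete, here is a hedged, minimal end-to-end sketch of a Similarity built on this low-level API: raw term frequency scoring with no norms, no idf, and no query normalization. The class name is hypothetical and not part of this patch; the imports mirror those at the top of Similarity.java:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.index.IndexReader.AtomicReaderContext;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.util.TermContext;

    public class RawTermFrequencySimilarity extends Similarity {

      @Override
      public byte computeNorm(FieldInvertState state) {
        return 0;                                  // this model ignores length normalization
      }

      @Override
      public float sloppyFreq(int distance) {
        return 1.0f / (distance + 1);              // simple decay for sloppy matches
      }

      @Override
      public Stats computeStats(IndexSearcher searcher, String fieldName,
                                float queryBoost, TermContext... termContexts) {
        return new Stats() {                       // no collection statistics needed
          @Override
          public float getValueForNormalization() { return 1.0f; }  // opt out of query norm
          @Override
          public void normalize(float queryNorm, float topLevelBoost) { /* ignored */ }
        };
      }

      @Override
      public ExactDocScorer exactDocScorer(Stats stats, String fieldName,
                                           AtomicReaderContext context) {
        return new ExactDocScorer() {
          @Override
          public float score(int doc, int freq) { return freq; }    // score == raw tf
        };
      }

      @Override
      public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName,
                                             AtomicReaderContext context) {
        return new SloppyDocScorer() {
          @Override
          public float score(int doc, float freq) { return freq; }
        };
      }
    }

A SimilarityProvider would then hand this instance out for whichever fields should use it, and continue to supply coord() and queryNorm() as before.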

View File

@ -25,11 +25,13 @@ final class SloppyPhraseScorer extends PhraseScorer {
private PhrasePositions repeats[]; private PhrasePositions repeats[];
private PhrasePositions tmpPos[]; // for flipping repeating pps. private PhrasePositions tmpPos[]; // for flipping repeating pps.
private boolean checkedRepeats; private boolean checkedRepeats;
private final Similarity similarity;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
int slop, byte[] norms) { int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, similarity, norms); super(weight, postings, docScorer);
this.slop = slop; this.slop = slop;
this.similarity = similarity;
} }
/** /**

View File

@ -0,0 +1,831 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
/**
* Implementation of {@link Similarity} with the Vector Space Model.
* <p>
* Expert: Scoring API.
* <p>TFIDFSimilarity defines the components of Lucene scoring.
* Overriding computation of these components is a convenient
* way to alter Lucene scoring.
*
* <p>Suggested reading:
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html">
* Introduction To Information Retrieval, Chapter 6</a>.
*
* <p>The following describes how Lucene scoring evolves from
* underlying information retrieval models to (efficient) implementation.
* We first brief on <i>VSM Score</i>,
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>,
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i>
* (the latter is connected directly with Lucene classes and methods).
*
* <p>Lucene combines
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model">
* Boolean model (BM) of Information Retrieval</a>
* with
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
* Vector Space Model (VSM) of Information Retrieval</a> -
* documents "approved" by BM are scored by VSM.
*
* <p>In VSM, documents and queries are represented as
* weighted vectors in a multi-dimensional space,
* where each distinct index term is a dimension,
* and weights are
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values.
*
* <p>VSM does not require weights to be <i>Tf-idf</i> values,
* but <i>Tf-idf</i> values are believed to produce search results of high quality,
* and so Lucene is using <i>Tf-idf</i>.
* <i>Tf</i> and <i>Idf</i> are described in more detail below,
 * but for now, for completeness, let's just say that
* for given term <i>t</i> and document (or query) <i>x</i>,
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i>
* (when one increases so does the other) and
* <i>idf(t)</i> similarly varies with the inverse of the
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its euclidean norm is normalizing it to a unit vector.
*
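As an illustrative aside (not part of this file's javadoc): the VSM score above is just the cosine of the angle between the two weighted term vectors, which a few lines of Java make concrete (assumes java.util.Map; purely illustrative, Lucene never materializes these vectors at search time):

    // Hedged sketch: cosine similarity of two tf-idf weighted term vectors keyed by term text.
    static float cosineSimilarity(Map<String, Float> vq, Map<String, Float> vd) {
      float dot = 0f, normQ = 0f, normD = 0f;
      for (Map.Entry<String, Float> e : vq.entrySet()) {
        Float wd = vd.get(e.getKey());
        if (wd != null) dot += e.getValue() * wd;   // V(q) . V(d)
        normQ += e.getValue() * e.getValue();       // |V(q)|^2
      }
      for (float w : vd.values()) normD += w * w;   // |V(d)|^2
      return dot / (float) (Math.sqrt(normQ) * Math.sqrt(normD));
    }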
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this info is probably ok,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field, document length normalization is by the length of the certain field,
* and in addition to document boost there are also document fields boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the multiplication of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is correct for some of the queries),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From search optimization perspective, it is a valid question
* why bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence documents ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
 * Cosine Similarity</a> can be used to find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
 * <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <ol>
* <li>
* <A NAME="formula_tf"></A>
* <b><i>tf(t in d)</i></b>
* correlates to the term's <i>frequency</i>,
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
* Documents that have more occurrences of a given term receive a higher score.
 *      Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation.
 *      However, if a query contains the same term twice, there will be
 *      two term-queries with that same term, and hence the computation would still be correct (although
 *      not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
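 *      As a minimal, illustrative sketch (the class name is hypothetical, and this is
 *      not the default implementation), a subclass could dampen high frequencies further
 *      with a logarithmic tf:
 *      <pre>
 *      public class LogTFSimilarity extends DefaultSimilarity {
 *        public float tf(float freq) {
 *          return freq == 0 ? 0 : 1 + (float) Math.log(freq); // sublinear term frequency
 *        }
 *      }
 *      </pre>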
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
 *      This means rarer terms contribute more to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
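 *      As an illustrative computation, with <i>numDocs</i> = 1000 and <i>docFreq</i> = 9 this formula gives
 *      <i>idf(t)</i> = 1 + ln(1000/10) &asymp; 5.61 (the log in the default implementation is the natural logarithm).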
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
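 *      For example, with the default implementation a document matching 2 out of 3 query terms
 *      receives <i>coord(q,d)</i> = 2/3 (the overlap divided by the maximum possible overlap).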
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
 *      This is a search time factor computed by the {@link SimilarityProvider} in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
 *    <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
 *   <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
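 *      As an illustrative computation: for a single-term query with boost 1 and
 *      <i>idf(t)</i> &asymp; 5.61, <i>sumOfSquaredWeights</i> = (1 &middot; 5.61)<sup>2</sup> &asymp; 31.5,
 *      and hence <i>queryNorm(q)</i> &asymp; 1/&radic;31.5 &asymp; 0.18.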
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
 *      Notice that there is really no direct API for accessing the boost of one term in a multi-term query;
 *      rather, multi-term queries are represented as multiple
 *      {@link org.apache.lucene.search.TermQuery TermQuery} objects,
 *      and so the boost of a term in the query is accessible by calling
 *      {@link org.apache.lucene.search.Query#getBoost() getBoost()} on the corresponding sub-query.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
 *        <li><b>lengthNorm</b> - computed
 *        when the document is added to the index, based on the number of tokens
 *        of this field in the document, so that shorter fields contribute more to the score.
 *        lengthNorm is computed by the Similarity class in effect at indexing time.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p>
* When a document is added to the index, all the above factors are multiplied.
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
 *    <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
* &nbsp;&middot;&nbsp;
* lengthNorm
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&prod;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
* </td>
* </tr>
 *   <tr valign="top">
* <td></td>
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
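 *      For example (an illustrative computation), a document with boost 1.0 containing two
 *      instances of a field named <i>t</i> with boosts 2.0 and 3.0, and a lengthNorm of 0.5,
 *      would yield norm(t,d) = 1.0 &middot; 0.5 &middot; 2.0 &middot; 3.0 = 3.0.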
 *      However, the resulting <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
 *      before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
 *      The rationale supporting such lossy compression of norm values is that,
 *      given how difficult (and inaccurate) it is for users to express their true information
 *      need in a query, only big differences matter.
* <br>&nbsp;<br>
 *      Lastly, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
 *      using a different {@link Similarity} at search time.
* <br>&nbsp;<br>
* </li>
* </ol>
*
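 * <p>A minimal sketch of plugging a custom TF-IDF variant in through a {@link SimilarityProvider}
 * (the variable names are illustrative, <code>LogTFSimilarity</code> refers to the hypothetical
 * subclass sketched above, and this assumes {@link org.apache.lucene.search.DefaultSimilarityProvider}
 * is non-final and its <code>get(String)</code> method may be overridden):
 *
 * <pre>
 *   SimilarityProvider provider = new DefaultSimilarityProvider() {
 *     private final Similarity sim = new LogTFSimilarity();
 *     public Similarity get(String field) {
 *       return sim;
 *     }
 *   };
 *   writerConfig.setSimilarityProvider(provider);   // index time
 *   searcher.setSimilarityProvider(provider);       // search time
 * </pre>
 *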
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
   * Note that {@link IndexSearcher#maxDoc()} is used instead of
   * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because it is used in
   * conjunction with {@link IndexSearcher#docFreq(Term)}, and when the latter
   * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
   * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
*
* @param stats statistics of the term in question
* @param searcher the document collection being searched
   * @return an Explanation object that includes both an idf score factor
   *         and an explanation for the term.
* @throws IOException
*/
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
final int df = stats.docFreq();
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param stats statistics of the terms in the phrase
* @param searcher the document collection being searched
   * @return an Explanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final Explanation exp = new Explanation();
exp.setDescription("idf(), sum of:");
for (final TermContext stat : stats ) {
final int df = stat.docFreq();
final float termIdf = idf(df, max);
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
idf += termIdf;
}
exp.setValue(idf);
return exp;
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
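  // A minimal usage sketch (illustrative, not part of the original source): the single-byte
  // encoding above is lossy, e.g.
  //   byte b = encodeNormValue(0.89f);
  //   float v = decodeNormValue(b);   // yields 0.75f rather than 0.89f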
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
TermContext... termContexts) throws IOException {
final Explanation idf = termContexts.length == 1
? idfExplain(termContexts[0], searcher)
: idfExplain(termContexts, searcher);
return new IDFStats(idf, queryBoost);
}
@Override
public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new ExactTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
@Override
public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new SloppyTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
// TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot.
private final class ExactTFIDFDocScorer extends ExactDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
ExactTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = tf(i) * weightValue;
}
@Override
public float score(int doc, int freq) {
final float raw = // compute tf(f)*weight
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
private final class SloppyTFIDFDocScorer extends SloppyDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
SloppyTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
}
@Override
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
private static class IDFStats extends Stats {
/** The idf and its explanation */
private final Explanation idf;
private float queryNorm;
private float queryWeight;
private final float queryBoost;
private float value;
public IDFStats(Explanation idf, float queryBoost) {
// TODO: Validate?
this.idf = idf;
this.queryBoost = queryBoost;
this.queryWeight = idf.getValue() * queryBoost; // compute query weight
}
@Override
public float getValueForNormalization() {
// TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
return queryWeight * queryWeight; // sum of squared weights
}
@Override
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; // normalize query weight
value = queryWeight * idf.getValue(); // idf for document
}
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, byte[] norms) {
Explanation result = new Explanation();
result.setDescription("score(doc="+doc+",freq="+freq+"), product of:");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight, product of:");
Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
if (stats.queryBoost != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(stats.idf);
Explanation queryNormExpl = new Explanation(stats.queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
stats.idf.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight in "+doc+
", product of:");
Explanation tfExplanation = new Explanation();
tfExplanation.setValue(tf(freq.getValue()));
tfExplanation.setDescription("tf(freq="+freq.getValue()+"), with freq of:");
tfExplanation.addDetail(freq);
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
float fieldNorm =
norms!=null ? decodeNormValue(norms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
stats.idf.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
}
}

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod { abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
@ -43,7 +43,7 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
addClause(topLevel, term, docCount, boost, null); addClause(topLevel, term, docCount, boost, null);
} }
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException; protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, TermContext states) throws IOException;
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {

View File

@ -27,9 +27,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
@ -39,28 +39,19 @@ import org.apache.lucene.util.ToStringUtils;
public class TermQuery extends Query { public class TermQuery extends Query {
private final Term term; private final Term term;
private int docFreq; private int docFreq;
private transient PerReaderTermState perReaderTermState; private transient TermContext perReaderTermState;
private class TermWeight extends Weight { private class TermWeight extends Weight {
private final Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private final float idf; private transient TermContext termStates;
private float queryNorm;
private float queryWeight;
private final IDFExplanation idfExp;
private transient PerReaderTermState termStates;
public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq) public TermWeight(IndexSearcher searcher, TermContext termStates)
throws IOException { throws IOException {
assert termStates != null : "PerReaderTermState must not be null"; assert termStates != null : "TermContext must not be null";
this.termStates = termStates; this.termStates = termStates;
this.similarity = searcher.getSimilarityProvider().get(term.field()); this.similarity = searcher.getSimilarityProvider().get(term.field());
if (docFreq != -1) { this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates);
idfExp = similarity.idfExplain(term, searcher, docFreq);
} else {
idfExp = similarity.idfExplain(term, searcher);
}
idf = idfExp.getIdf();
} }
@Override @Override
@ -70,19 +61,13 @@ public class TermQuery extends Query {
public Query getQuery() { return TermQuery.this; } public Query getQuery() { return TermQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -97,7 +82,7 @@ public class TermQuery extends Query {
} }
final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state); final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state);
assert docs != null; assert docs != null;
return new TermScorer(this, docs, similarity, context.reader.norms(field)); return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context));
} }
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
@ -107,82 +92,28 @@ public class TermQuery extends Query {
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException { IndexReader reader = context.reader;
final IndexReader reader = context.reader;
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, idfExp.explain());
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(expl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
expl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
String field = term.field();
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+term+" in "+doc+
"), product of:");
Explanation tfExplanation = new Explanation();
int tf = 0;
DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes()); DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes());
if (docs != null) { if (docs != null) {
int newDoc = docs.advance(doc); int newDoc = docs.advance(doc);
if (newDoc == doc) { if (newDoc == doc) {
tf = docs.freq(); int freq = docs.freq();
} ExactDocScorer docScorer = similarity.exactDocScorer(stats, term.field(), context);
tfExplanation.setValue(similarity.tf(tf)); ComplexExplanation result = new ComplexExplanation();
tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
} else { Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq));
tfExplanation.setValue(0.0f); result.addDetail(scoreExplanation);
tfExplanation.setDescription("no matching term"); result.setValue(scoreExplanation.getValue());
} result.setMatch(true);
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(expl);
Explanation fieldNormExpl = new Explanation();
final byte[] fieldNorms = reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
expl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result; return result;
} }
} }
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
/** Constructs a query for the term <code>t</code>. */ /** Constructs a query for the term <code>t</code>. */
public TermQuery(Term t) { public TermQuery(Term t) {
this(t, -1); this(t, -1);
@ -200,7 +131,7 @@ public class TermQuery extends Query {
/** Expert: constructs a TermQuery that will use the /** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq * provided docFreq instead of looking up the docFreq
* against the searcher. */ * against the searcher. */
public TermQuery(Term t, PerReaderTermState states) { public TermQuery(Term t, TermContext states) {
assert states != null; assert states != null;
term = t; term = t;
docFreq = states.docFreq(); docFreq = states.docFreq();
@ -213,20 +144,20 @@ public class TermQuery extends Query {
@Override @Override
public Weight createWeight(IndexSearcher searcher) throws IOException { public Weight createWeight(IndexSearcher searcher) throws IOException {
final ReaderContext context = searcher.getTopReaderContext(); final ReaderContext context = searcher.getTopReaderContext();
final int weightDocFreq; final TermContext termState;
final PerReaderTermState termState;
if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { if (perReaderTermState == null || perReaderTermState.topReaderContext != context) {
// make TermQuery single-pass if we don't have a PRTS or if the context differs! // make TermQuery single-pass if we don't have a PRTS or if the context differs!
termState = PerReaderTermState.build(context, term, true); // cache term lookups! termState = TermContext.build(context, term, true); // cache term lookups!
// we must not ignore the given docFreq - if set use the given value
weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq;
} else { } else {
// PRTS was pre-build for this IS // PRTS was pre-build for this IS
termState = this.perReaderTermState; termState = this.perReaderTermState;
weightDocFreq = docFreq;
} }
return new TermWeight(searcher, termState, weightDocFreq); // we must not ignore the given docFreq - if set use the given value (lie)
if (docFreq != -1)
termState.setDocFreq(docFreq);
return new TermWeight(searcher, termState);
} }
@Override @Override

View File

@ -25,20 +25,16 @@ import org.apache.lucene.index.DocsEnum;
*/ */
final class TermScorer extends Scorer { final class TermScorer extends Scorer {
private DocsEnum docsEnum; private DocsEnum docsEnum;
private byte[] norms;
private float weightValue;
private int doc = -1; private int doc = -1;
private int freq; private int freq;
private int pointer; private int pointer;
private int pointerMax; private int pointerMax;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
private int[] docs; private int[] docs;
private int[] freqs; private int[] freqs;
private final DocsEnum.BulkReadResult bulkResult; private final DocsEnum.BulkReadResult bulkResult;
private final Similarity similarity; private final Similarity.ExactDocScorer docScorer;
/** /**
* Construct a <code>TermScorer</code>. * Construct a <code>TermScorer</code>.
@ -47,22 +43,15 @@ final class TermScorer extends Scorer {
* The weight of the <code>Term</code> in the query. * The weight of the <code>Term</code> in the query.
* @param td * @param td
* An iterator over the documents matching the <code>Term</code>. * An iterator over the documents matching the <code>Term</code>.
* @param similarity * @param docScorer
* The </code>Similarity</code> implementation to be used for score * The </code>Similarity.ExactDocScorer</code> implementation
* computations. * to be used for score computations.
* @param norms
* The field norms of the document fields for the <code>Term</code>.
*/ */
TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { TermScorer(Weight weight, DocsEnum td, Similarity.ExactDocScorer docScorer) throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.docScorer = docScorer;
this.docsEnum = td; this.docsEnum = td;
this.norms = norms;
this.weightValue = weight.getValue();
bulkResult = td.getBulkResult(); bulkResult = td.getBulkResult();
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = similarity.tf(i) * weightValue;
} }
@Override @Override
@ -134,12 +123,7 @@ final class TermScorer extends Scorer {
@Override @Override
public float score() { public float score() {
assert doc != NO_MORE_DOCS; assert doc != NO_MORE_DOCS;
float raw = // compute tf(f)*weight return docScorer.score(doc, freq);
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: similarity.tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field
} }
/** /**

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
/** /**
* Base rewrite method for collecting only the top terms * Base rewrite method for collecting only the top terms
@ -80,7 +80,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
this.termComp = termsEnum.getComparator(); this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor: // lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null) if (st == null)
st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext)); st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
} }
@ -101,14 +101,14 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
if (t != null) { if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ // if the term is already in the PQ, only update docFreq of term in PQ
assert t.boost == boost : "boost should be equal in all segment TermsEnums"; assert t.boost == boost : "boost should be equal in all segment TermsEnums";
t.termState.register(state, readerContext.ord, termsEnum.docFreq()); t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else { } else {
// add new entry in PQ, we must clone the term, else it may get overwritten! // add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes); st.bytes.copy(bytes);
st.boost = boost; st.boost = boost;
visitedTerms.put(st.bytes, st); visitedTerms.put(st.bytes, st);
assert st.termState.docFreq() == 0; assert st.termState.docFreq() == 0;
st.termState.register(state, readerContext.ord, termsEnum.docFreq()); st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
stQueue.offer(st); stQueue.offer(st);
// possibly drop entries from queue // possibly drop entries from queue
if (stQueue.size() > maxSize) { if (stQueue.size() > maxSize) {
@ -116,7 +116,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
visitedTerms.remove(st.bytes); visitedTerms.remove(st.bytes);
st.termState.clear(); // reset the termstate! st.termState.clear(); // reset the termstate!
} else { } else {
st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext)); st = new ScoreTerm(termComp, new TermContext(topReaderContext));
} }
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
@ -171,8 +171,8 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
public final Comparator<BytesRef> termComp; public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef(); public final BytesRef bytes = new BytesRef();
public float boost; public float boost;
public final PerReaderTermState termState; public final TermContext termState;
public ScoreTerm(Comparator<BytesRef> termComp, PerReaderTermState termState) { public ScoreTerm(Comparator<BytesRef> termComp, TermContext termState) {
this.termComp = termComp; this.termComp = termComp;
this.termState = termState; this.termState = termState;
} }

View File

@ -41,11 +41,11 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
* <ol> * <ol>
* <li>A <code>Weight</code> is constructed by a top-level query, given a * <li>A <code>Weight</code> is constructed by a top-level query, given a
* <code>IndexSearcher</code> ({@link Query#createWeight(IndexSearcher)}). * <code>IndexSearcher</code> ({@link Query#createWeight(IndexSearcher)}).
* <li>The {@link #sumOfSquaredWeights()} method is called on the * <li>The {@link #getValueForNormalization()} method is called on the
* <code>Weight</code> to compute the query normalization factor * <code>Weight</code> to compute the query normalization factor
* {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the * {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the
* query. * query.
* <li>The query normalization factor is passed to {@link #normalize(float)}. At * <li>The query normalization factor is passed to {@link #normalize(float, float)}. At
* this point the weighting is complete. * this point the weighting is complete.
* <li>A <code>Scorer</code> is constructed by * <li>A <code>Scorer</code> is constructed by
* {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}. * {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}.
@ -68,11 +68,11 @@ public abstract class Weight {
/** The query that this concerns. */ /** The query that this concerns. */
public abstract Query getQuery(); public abstract Query getQuery();
/** The weight for this query. */ /** The value for normalization of contained query clauses (e.g. sum of squared weights). */
public abstract float getValue(); public abstract float getValueForNormalization() throws IOException;
/** Assigns the query normalization factor to this. */ /** Assigns the query normalization factor and boost from parent queries to this. */
public abstract void normalize(float norm); public abstract void normalize(float norm, float topLevelBoost);
/** /**
* Returns a {@link Scorer} which scores documents in/out-of order according * Returns a {@link Scorer} which scores documents in/out-of order according
@ -94,9 +94,6 @@ public abstract class Weight {
*/ */
public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException; public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException;
/** The sum of squared weights of contained query clauses. */
public abstract float sumOfSquaredWeights() throws IOException;
/** /**
* Returns true iff this implementation scores docs only out of order. This * Returns true iff this implementation scores docs only out of order. This
* method is used in conjunction with {@link Collector}'s * method is used in conjunction with {@link Collector}'s

View File

@ -18,11 +18,13 @@ package org.apache.lucene.search.payloads;
*/ */
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
@ -145,7 +147,35 @@ public class PayloadNearQuery extends SpanNearQuery {
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadNearSpanScorer(query.getSpans(context), this, return new PayloadNearSpanScorer(query.getSpans(context), this,
similarity, context.reader.norms(query.getField())); similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
expl.addDetail(scoreExplanation);
expl.setValue(scoreExplanation.getValue());
// now the payloads part
Explanation payloadExpl = function.explain(doc, scorer.payloadsSeen, scorer.payloadScore);
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
} }
} }
@ -155,8 +185,8 @@ public class PayloadNearQuery extends SpanNearQuery {
private int payloadsSeen; private int payloadsSeen;
protected PayloadNearSpanScorer(Spans spans, Weight weight, protected PayloadNearSpanScorer(Spans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException { Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, norms); super(spans, weight, similarity, docScorer);
this.spans = spans; this.spans = spans;
} }
@ -225,20 +255,6 @@ public class PayloadNearQuery extends SpanNearQuery {
return super.score() return super.score()
* function.docScore(doc, fieldName, payloadsSeen, payloadScore); * function.docScore(doc, fieldName, payloadsSeen, payloadScore);
} }
@Override
protected Explanation explain(int doc) throws IOException {
Explanation result = new Explanation();
// Add detail about tf/idf...
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
// Add detail about payload
Explanation payloadExpl = function.explain(doc, payloadsSeen, payloadScore);
result.addDetail(payloadExpl);
result.setValue(nonPayloadExpl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
} }
} }

View File

@ -26,6 +26,9 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.SpanWeight;
@ -76,7 +79,7 @@ public class PayloadTermQuery extends SpanTermQuery {
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadTermSpanScorer((TermSpans) query.getSpans(context), return new PayloadTermSpanScorer((TermSpans) query.getSpans(context),
this, similarity, context.reader.norms(query.getField())); this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
} }
protected class PayloadTermSpanScorer extends SpanScorer { protected class PayloadTermSpanScorer extends SpanScorer {
@ -86,8 +89,8 @@ public class PayloadTermQuery extends SpanTermQuery {
private final TermSpans termSpans; private final TermSpans termSpans;
public PayloadTermSpanScorer(TermSpans spans, Weight weight, public PayloadTermSpanScorer(TermSpans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException { Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, norms); super(spans, weight, similarity, docScorer);
termSpans = spans; termSpans = spans;
} }
@ -173,29 +176,40 @@ public class PayloadTermQuery extends SpanTermQuery {
protected float getPayloadScore() { protected float getPayloadScore() {
return function.docScore(doc, term.field(), payloadsSeen, payloadScore); return function.docScore(doc, term.field(), payloadsSeen, payloadScore);
} }
@Override
protected Explanation explain(final int doc) throws IOException {
ComplexExplanation result = new ComplexExplanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
// QUESTION: Is there a way to avoid this skipTo call? We need to know
// whether to load the payload or not
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
float payloadScore = getPayloadScore();
payloadBoost.setValue(payloadScore);
// GSI: I suppose we could toString the payload, but I don't think that
// would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * payloadScore);
result.setDescription("btq, product of:");
result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE
: Boolean.TRUE); // LUCENE-1303
return result;
} }
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
expl.addDetail(scoreExplanation);
expl.setValue(scoreExplanation.getValue());
// now the payloads part
// QUESTION: Is there a way to avoid this skipTo call? We need to know
// whether to load the payload or not
// GSI: I suppose we could toString the payload, but I don't think that
// would be a good idea
Explanation payloadExpl = new Explanation(scorer.getPayloadScore(), "scorePayload(...)");
payloadExpl.setValue(scorer.getPayloadScore());
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("btq, product of:");
result.setMatch(expl.getValue() == 0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
} }
} }

View File

@ -27,7 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite; import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.ScoringRewrite; import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
/** /**
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery}, * Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
@ -155,7 +155,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
} }
@Override @Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term); final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost); q.setBoost(boost);
topLevel.addClause(q); topLevel.addClause(q);
@ -204,7 +204,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
} }
@Override @Override
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term); final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost); q.setBoost(boost);
topLevel.addClause(q); topLevel.addClause(q);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.spans;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
@ -29,22 +30,21 @@ import org.apache.lucene.search.Similarity;
*/ */
public class SpanScorer extends Scorer { public class SpanScorer extends Scorer {
protected Spans spans; protected Spans spans;
protected byte[] norms;
protected float value;
protected boolean more = true; protected boolean more = true;
protected int doc; protected int doc;
protected float freq; protected float freq;
protected final Similarity similarity; protected final Similarity similarity;
protected final Similarity.SloppyDocScorer docScorer;
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer)
throws IOException { throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.similarity = similarity;
this.docScorer = docScorer;
this.spans = spans; this.spans = spans;
this.norms = norms;
this.value = weight.getValue();
if (this.spans.next()) { if (this.spans.next()) {
doc = -1; doc = -1;
} else { } else {
@ -94,27 +94,11 @@ public class SpanScorer extends Scorer {
@Override @Override
public float score() throws IOException { public float score() throws IOException {
float raw = similarity.tf(freq) * value; // raw score return docScorer.score(doc, freq);
return norms == null? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize
} }
@Override @Override
public float freq() throws IOException { public float freq() throws IOException {
return freq; return freq;
} }
/** This method is no longer an official member of {@link Scorer},
* but it is needed by SpanWeight to build an explanation. */
protected Explanation explain(final int doc) throws IOException {
Explanation tfExplanation = new Explanation();
int expDoc = advance(doc);
float phraseFreq = (expDoc == doc) ? freq : 0.0f;
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
return tfExplanation;
}
} }

View File

@ -18,125 +18,76 @@ package org.apache.lucene.search.spans;
*/ */
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.TermContext;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet;
/** /**
* Expert-only. Public for use by other weight implementations * Expert-only. Public for use by other weight implementations
*/ */
public class SpanWeight extends Weight { public class SpanWeight extends Weight {
protected Similarity similarity; protected Similarity similarity;
protected float value;
protected float idf;
protected float queryNorm;
protected float queryWeight;
protected Set<Term> terms; protected Set<Term> terms;
protected SpanQuery query; protected SpanQuery query;
private IDFExplanation idfExp; protected Similarity.Stats stats;
public SpanWeight(SpanQuery query, IndexSearcher searcher) public SpanWeight(SpanQuery query, IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(query.getField()); this.similarity = searcher.getSimilarityProvider().get(query.getField());
this.query = query; this.query = query;
terms=new HashSet<Term>(); terms=new TreeSet<Term>();
query.extractTerms(terms); query.extractTerms(terms);
final ReaderContext context = searcher.getTopReaderContext();
idfExp = similarity.idfExplain(terms, searcher); final TermContext states[] = new TermContext[terms.size()];
idf = idfExp.getIdf(); int i = 0;
for (Term term : terms)
states[i++] = TermContext.build(context, term, true);
stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states);
} }
@Override @Override
public Query getQuery() { return query; } public Query getQuery() { return query; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() throws IOException {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() throws IOException {
queryWeight = idf * query.getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new SpanScorer(query.getSpans(context), this, similarity, context.reader return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
.norms(query.getField()));
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException { Scorer scorer = scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
ComplexExplanation result = new ComplexExplanation(); ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
String field = ((SpanQuery)getQuery()).getField(); Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
Explanation idfExpl = result.setValue(scoreExplanation.getValue());
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")"); result.setMatch(true);
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost");
if (getQuery().getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+
" in "+doc+"), product of:");
Explanation tfExpl = ((SpanScorer)scorer(context, ScorerContext.def())).explain(doc);
fieldExpl.addDetail(tfExpl);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch()));
fieldExpl.setValue(tfExpl.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result; return result;
} }
} }
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
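A minimal sketch (editor's addition, not part of this commit) of how the reworked Weight lifecycle above fits together for a top-level query with no outer boost; query, searcher and a leaf context are assumed to be in scope:

Weight weight = query.createWeight(searcher);
float v = weight.getValueForNormalization();                  // replaces sumOfSquaredWeights()
float norm = searcher.getSimilarityProvider().queryNorm(v);
weight.normalize(norm, 1.0f);                                 // topLevelBoost is 1.0 at the top level
Scorer scorer = weight.scorer(context, ScorerContext.def());  // per-document scoring delegates to the Similarity's doc scorers

The Stats object returned by Similarity.computeStats carries the normalization state that previously lived in per-Weight fields such as idf, queryNorm and queryWeight.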

View File

@ -28,25 +28,27 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.TermsEnum.SeekStatus;
/** /**
* Maintains a {@link IndexReader} {@link TermState} view over * Maintains a {@link IndexReader} {@link TermState} view over
* {@link IndexReader} instances containing a single term. The * {@link IndexReader} instances containing a single term. The
* {@link PerReaderTermState} doesn't track if the given {@link TermState} * {@link TermContext} doesn't track if the given {@link TermState}
* objects are valid, nor whether the {@link TermState} instances refer to the * objects are valid, nor whether the {@link TermState} instances refer to the
* same terms in the associated readers. * same terms in the associated readers.
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class PerReaderTermState { public final class TermContext {
public final ReaderContext topReaderContext; // for asserting! public final ReaderContext topReaderContext; // for asserting!
private final TermState[] states; private final TermState[] states;
private int docFreq; private int docFreq;
private long totalTermFreq;
/** /**
* Creates an empty {@link PerReaderTermState} from a {@link ReaderContext} * Creates an empty {@link TermContext} from a {@link ReaderContext}
*/ */
public PerReaderTermState(ReaderContext context) { public TermContext(ReaderContext context) {
assert context != null && context.isTopLevel; assert context != null && context.isTopLevel;
topReaderContext = context; topReaderContext = context;
docFreq = 0; docFreq = 0;
@ -60,28 +62,28 @@ public final class PerReaderTermState {
} }
/** /**
* Creates a {@link PerReaderTermState} with an initial {@link TermState}, * Creates a {@link TermContext} with an initial {@link TermState},
* {@link IndexReader} pair. * {@link IndexReader} pair.
*/ */
public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) { public TermContext(ReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
this(context); this(context);
register(state, ord, docFreq); register(state, ord, docFreq, totalTermFreq);
} }
/** /**
* Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the * Creates a {@link TermContext} from a top-level {@link ReaderContext} and the
* given {@link Term}. This method will look up the given term in all of the context's leaf readers * given {@link Term}. This method will look up the given term in all of the context's leaf readers
* and register each of the readers containing the term in the returned {@link PerReaderTermState} * and register each of the readers containing the term in the returned {@link TermContext}
* using the leaf reader's ordinal. * using the leaf reader's ordinal.
* <p> * <p>
* Note: the given context must be a top-level context. * Note: the given context must be a top-level context.
*/ */
public static PerReaderTermState build(ReaderContext context, Term term, boolean cache) public static TermContext build(ReaderContext context, Term term, boolean cache)
throws IOException { throws IOException {
assert context != null && context.isTopLevel; assert context != null && context.isTopLevel;
final String field = term.field(); final String field = term.field();
final BytesRef bytes = term.bytes(); final BytesRef bytes = term.bytes();
final PerReaderTermState perReaderTermState = new PerReaderTermState(context); final TermContext perReaderTermState = new TermContext(context);
final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context);
for (int i = 0; i < leaves.length; i++) { for (int i = 0; i < leaves.length; i++) {
final Fields fields = leaves[i].reader.fields(); final Fields fields = leaves[i].reader.fields();
@ -91,7 +93,7 @@ public final class PerReaderTermState {
final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
if (termsEnum.seekExact(bytes, cache)) { if (termsEnum.seekExact(bytes, cache)) {
final TermState termState = termsEnum.termState(); final TermState termState = termsEnum.termState();
perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} }
} }
} }
@ -100,7 +102,7 @@ public final class PerReaderTermState {
} }
/** /**
* Clears the {@link PerReaderTermState} internal state and removes all * Clears the {@link TermContext} internal state and removes all
* registered {@link TermState}s * registered {@link TermState}s
*/ */
public void clear() { public void clear() {
@ -112,12 +114,16 @@ public final class PerReaderTermState {
* Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal * Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal
* should be derived from a {@link ReaderContext}'s leaf ord. * should be derived from a {@link ReaderContext}'s leaf ord.
*/ */
public void register(TermState state, final int ord, final int docFreq) { public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
assert state != null : "state must not be null"; assert state != null : "state must not be null";
assert ord >= 0 && ord < states.length; assert ord >= 0 && ord < states.length;
assert states[ord] == null : "state for ord: " + ord assert states[ord] == null : "state for ord: " + ord
+ " already registered"; + " already registered";
this.docFreq += docFreq; this.docFreq += docFreq;
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
this.totalTermFreq += totalTermFreq;
else
this.totalTermFreq = -1;
states[ord] = state; states[ord] = state;
} }
@ -137,11 +143,27 @@ public final class PerReaderTermState {
/** /**
* Returns the accumulated document frequency of all {@link TermState} * Returns the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}. * instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated document frequency of all {@link TermState} * @return the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}. * instances passed to {@link #register(TermState, int, int, long)}.
*/ */
public int docFreq() { public int docFreq() {
return docFreq; return docFreq;
} }
/**
* Returns the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public long totalTermFreq() {
return totalTermFreq;
}
/** expert: only available for queries that want to lie about docfreq
* @lucene.internal */
public void setDocFreq(int docFreq) {
this.docFreq = docFreq;
}
} }
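A short sketch (editor's addition) of building the renamed TermContext and reading the aggregated statistics described above, assuming an existing searcher and term:

ReaderContext top = searcher.getTopReaderContext();
TermContext states = TermContext.build(top, term, true);   // registers the term's TermState for each leaf reader
int df = states.docFreq();                                  // document frequency summed across all leaves
long ttf = states.totalTermFreq();                          // -1 if any leaf reports no total term frequency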

View File

@ -62,12 +62,7 @@ public class AssertingIndexSearcher extends IndexSearcher {
} }
@Override @Override
public float getValue() { public void normalize(float norm, float topLevelBoost) {
return w.getValue();
}
@Override
public void normalize(float norm) {
throw new IllegalStateException("Weight already normalized."); throw new IllegalStateException("Weight already normalized.");
} }
@ -77,7 +72,7 @@ public class AssertingIndexSearcher extends IndexSearcher {
} }
@Override @Override
public float sumOfSquaredWeights() throws IOException { public float getValueForNormalization() throws IOException {
throw new IllegalStateException("Weight already normalized."); throw new IllegalStateException("Weight already normalized.");
} }

View File

@ -329,8 +329,9 @@ public class CheckHits {
Explanation detail[] = expl.getDetails(); Explanation detail[] = expl.getDetails();
if (detail!=null) { if (detail!=null) {
if (detail.length==1) { if (detail.length==1) {
// simple containment, no matter what the description says, // simple containment, unless it's a freq of: (which lets a query explain how the freq is calculated),
// just verify contained expl has same score // just verify contained expl has same score
if (!expl.getDescription().endsWith("with freq of:"))
verifyExplanation(q,doc,score,deep,detail[0]); verifyExplanation(q,doc,score,deep,detail[0]);
} else { } else {
// explanation must either: // explanation must either:
@ -357,6 +358,7 @@ public class CheckHits {
} }
} }
} }
// TODO: this is a TERRIBLE assertion!!!!
Assert.assertTrue( Assert.assertTrue(
q+": multi valued explanation description=\""+descr q+": multi valued explanation description=\""+descr
+"\" must be 'max of plus x times others' or end with 'product of'" +"\" must be 'max of plus x times others' or end with 'product of'"

View File

@ -38,7 +38,6 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -375,7 +374,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
Term searchTerm = new Term("id", "6"); Term searchTerm = new Term("id", "6");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("wrong delete count", 1, delCount); assertEquals("wrong delete count", 1, delCount);
reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", sim.encodeNormValue(2.0f));
reader.close(); reader.close();
searcher.close(); searcher.close();
@ -421,7 +421,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
Term searchTerm = new Term("id", "6"); Term searchTerm = new Term("id", "6");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("wrong delete count", 1, delCount); assertEquals("wrong delete count", 1, delCount);
reader.setNorm(22, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(22, "content", sim.encodeNormValue(2.0f));
reader.close(); reader.close();
// make sure they "took": // make sure they "took":
@ -483,7 +484,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();
} }
@ -526,7 +528,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();
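As the test changes above suggest, the norm encode/decode helpers appear to now live on DefaultSimilarity (a TFIDFSimilarity) rather than on the base Similarity, so the tests construct one explicitly. A minimal sketch, with a hypothetical docID:

DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(docID, "content", sim.encodeNormValue(2.0f));                       // write an encoded norm byte
float decoded = sim.decodeNormValue(MultiNorms.norms(reader, "content")[docID]);   // decode it on the way back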

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
@ -655,7 +656,8 @@ public class TestDeletionPolicy extends LuceneTestCase {
writer.close(); writer.close();
IndexReader reader = IndexReader.open(dir, policy, false); IndexReader reader = IndexReader.open(dir, policy, false);
reader.deleteDocument(3*i+1); reader.deleteDocument(3*i+1);
reader.setNorm(4*i+1, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(4*i+1, "content", sim.encodeNormValue(2.0F));
IndexSearcher searcher = newSearcher(reader); IndexSearcher searcher = newSearcher(reader);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(16*(1+i), hits.length); assertEquals(16*(1+i), hits.length);
@ -781,7 +783,8 @@ public class TestDeletionPolicy extends LuceneTestCase {
writer.close(); writer.close();
IndexReader reader = IndexReader.open(dir, policy, false); IndexReader reader = IndexReader.open(dir, policy, false);
reader.deleteDocument(3); reader.deleteDocument(3);
reader.setNorm(5, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(5, "content", sim.encodeNormValue(2.0F));
IndexSearcher searcher = newSearcher(reader); IndexSearcher searcher = newSearcher(reader);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(16, hits.length); assertEquals(16, hits.length);

View File

@ -71,7 +71,7 @@ public class TestIndexFileDeleter extends LuceneTestCase {
Term searchTerm = new Term("id", "7"); Term searchTerm = new Term("id", "7");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();

View File

@ -421,7 +421,7 @@ public class TestIndexReader extends LuceneTestCase
// expected // expected
} }
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm after close failed to throw IOException"); fail("setNorm after close failed to throw IOException");
@ -462,7 +462,7 @@ public class TestIndexReader extends LuceneTestCase
// expected // expected
} }
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm should have hit LockObtainFailedException"); fail("setNorm should have hit LockObtainFailedException");
@ -494,7 +494,7 @@ public class TestIndexReader extends LuceneTestCase
// now open reader & set norm for doc 0 // now open reader & set norm for doc 0
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); reader.setNorm(0, "content", sim.encodeNormValue(2.0f));
// we should be holding the write lock now: // we should be holding the write lock now:
@ -539,7 +539,7 @@ public class TestIndexReader extends LuceneTestCase
addDoc(writer, searchTerm.text()); addDoc(writer, searchTerm.text());
writer.close(); writer.close();
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
// now open reader & set norm for doc 0 (writes to // now open reader & set norm for doc 0 (writes to
// _0_1.s0) // _0_1.s0)
reader = IndexReader.open(dir, false); reader = IndexReader.open(dir, false);
@ -738,7 +738,7 @@ public class TestIndexReader extends LuceneTestCase
} }
reader = IndexReader.open(dir, false); reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(1, "content", sim.encodeNormValue(2.0f)); reader.setNorm(1, "content", sim.encodeNormValue(2.0f));
fail("did not hit exception when calling setNorm on an invalid doc number"); fail("did not hit exception when calling setNorm on an invalid doc number");

View File

@ -273,7 +273,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
* @throws Exception * @throws Exception
*/ */
private void performDefaultTests(IndexReader r1) throws Exception { private void performDefaultTests(IndexReader r1) throws Exception {
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]); float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]);
IndexReader pr1Clone = (IndexReader) r1.clone(); IndexReader pr1Clone = (IndexReader) r1.clone();
@ -329,7 +329,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
TestIndexReaderReopen.createIndex(random, dir1, false); TestIndexReaderReopen.createIndex(random, dir1, false);
SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false)); SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false));
origSegmentReader.deleteDocument(1); origSegmentReader.deleteDocument(1);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f)); origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f));
SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader
@ -429,7 +429,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
final Directory dir1 = newDirectory(); final Directory dir1 = newDirectory();
TestIndexReaderReopen.createIndex(random, dir1, false); TestIndexReaderReopen.createIndex(random, dir1, false);
IndexReader orig = IndexReader.open(dir1, false); IndexReader orig = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
orig.setNorm(1, "field1", sim.encodeNormValue(17.0f)); orig.setNorm(1, "field1", sim.encodeNormValue(17.0f));
final byte encoded = sim.encodeNormValue(17.0f); final byte encoded = sim.encodeNormValue(17.0f);
assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]); assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]);

View File

@ -47,9 +47,9 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// disable length norm // disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
}; };
} }
@ -217,7 +217,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
IndexReader reader4C = (IndexReader) reader3C.clone(); IndexReader reader4C = (IndexReader) reader3C.clone();
SegmentReader segmentReader4C = getOnlySegmentReader(reader4C); SegmentReader segmentReader4C = getOnlySegmentReader(reader4C);
assertEquals(4, reader3CCNorm.bytesRef().get()); assertEquals(4, reader3CCNorm.bytesRef().get());
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f)); reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f));
// generate a cannot update exception in reader1 // generate a cannot update exception in reader1
@ -278,7 +278,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
// System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); // System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm);
modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(i, Float.valueOf(newNorm));
modifiedNorms.set(k, Float.valueOf(origNorm)); modifiedNorms.set(k, Float.valueOf(origNorm));
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm)); ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm));
ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm)); ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm));
// System.out.println("setNorm i: "+i); // System.out.println("setNorm i: "+i);
@ -300,7 +300,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
assertEquals("number of norms mismatches", numDocNorms, b.length); assertEquals("number of norms mismatches", numDocNorms, b.length);
ArrayList<Float> storedNorms = (i == 1 ? modifiedNorms : norms); ArrayList<Float> storedNorms = (i == 1 ? modifiedNorms : norms);
for (int j = 0; j < b.length; j++) { for (int j = 0; j < b.length; j++) {
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
float norm = sim.decodeNormValue(b[j]); float norm = sim.decodeNormValue(b[j]);
float norm1 = storedNorms.get(j).floatValue(); float norm1 = storedNorms.get(j).floatValue();
assertEquals("stored norm value of " + field + " for doc " + j + " is " assertEquals("stored norm value of " + field + " for doc " + j + " is "
@ -340,7 +340,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
// return unique norm values that are unchanged by encoding/decoding // return unique norm values that are unchanged by encoding/decoding
private float nextNorm(String fname) { private float nextNorm(String fname) {
float norm = lastNorm + normDelta; float norm = lastNorm + normDelta;
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
do { do {
float norm1 = sim.decodeNormValue( float norm1 = sim.decodeNormValue(
sim.encodeNormValue(norm)); sim.encodeNormValue(norm));

View File

@ -131,7 +131,7 @@ public class TestIndexReaderOnDiskFull extends LuceneTestCase {
dir.setMaxSizeInBytes(thisDiskFree); dir.setMaxSizeInBytes(thisDiskFree);
dir.setRandomIOExceptionRate(rate); dir.setRandomIOExceptionRate(rate);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
if (0 == x) { if (0 == x) {
int docId = 12; int docId = 12;

View File

@ -606,7 +606,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
IndexReader reader2 = reader1.reopen(); IndexReader reader2 = reader1.reopen();
modifier = IndexReader.open(dir1, false); modifier = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(1, "field1", sim.encodeNormValue(50f)); modifier.setNorm(1, "field1", sim.encodeNormValue(50f));
modifier.setNorm(1, "field2", sim.encodeNormValue(50f)); modifier.setNorm(1, "field2", sim.encodeNormValue(50f));
modifier.close(); modifier.close();
@ -702,7 +702,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
protected void modifyIndex(int i) throws IOException { protected void modifyIndex(int i) throws IOException {
if (i % 3 == 0) { if (i % 3 == 0) {
IndexReader modifier = IndexReader.open(dir, false); IndexReader modifier = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(i, "field1", sim.encodeNormValue(50f)); modifier.setNorm(i, "field1", sim.encodeNormValue(50f));
modifier.close(); modifier.close();
} else if (i % 3 == 1) { } else if (i % 3 == 1) {
@ -983,7 +983,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
} }
case 1: { case 1: {
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(4, "field1", sim.encodeNormValue(123f)); reader.setNorm(4, "field1", sim.encodeNormValue(123f));
reader.setNorm(44, "field2", sim.encodeNormValue(222f)); reader.setNorm(44, "field2", sim.encodeNormValue(222f));
reader.setNorm(44, "field4", sim.encodeNormValue(22f)); reader.setNorm(44, "field4", sim.encodeNormValue(22f));
@ -1007,7 +1007,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
} }
case 4: { case 4: {
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(5, "field1", sim.encodeNormValue(123f)); reader.setNorm(5, "field1", sim.encodeNormValue(123f));
reader.setNorm(55, "field2", sim.encodeNormValue(222f)); reader.setNorm(55, "field2", sim.encodeNormValue(222f));
reader.close(); reader.close();

View File

@ -116,8 +116,8 @@ public class TestMaxTermFrequency extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return (float) state.getMaxTermFrequency(); return encodeNormValue((float) state.getMaxTermFrequency());
} }
} }
} }
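A minimal sketch (editor's addition) of the new computeNorm contract illustrated above: the method now returns the already-encoded norm byte, so an implementation that disables length normalization calls encodeNormValue itself:

Similarity noLengthNorm = new DefaultSimilarity() {
  @Override
  public byte computeNorm(FieldInvertState state) {
    return encodeNormValue(state.getBoost());   // keep only the boost, ignore field length
  }
};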

View File

@ -46,9 +46,9 @@ public class TestNorms extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// disable length norm // disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
}; };
} }
@ -177,7 +177,7 @@ public class TestNorms extends LuceneTestCase {
//System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); //System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm);
modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(i, Float.valueOf(newNorm));
modifiedNorms.set(k, Float.valueOf(origNorm)); modifiedNorms.set(k, Float.valueOf(origNorm));
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm)); ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm));
ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm)); ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm));
} }
@ -192,8 +192,9 @@ public class TestNorms extends LuceneTestCase {
byte b[] = MultiNorms.norms(ir, field); byte b[] = MultiNorms.norms(ir, field);
assertEquals("number of norms mismatches",numDocNorms,b.length); assertEquals("number of norms mismatches",numDocNorms,b.length);
ArrayList<Float> storedNorms = (i==1 ? modifiedNorms : norms); ArrayList<Float> storedNorms = (i==1 ? modifiedNorms : norms);
DefaultSimilarity sim = (DefaultSimilarity) similarityProviderOne.get(field);
for (int j = 0; j < b.length; j++) { for (int j = 0; j < b.length; j++) {
float norm = similarityProviderOne.get(field).decodeNormValue(b[j]); float norm = sim.decodeNormValue(b[j]);
float norm1 = storedNorms.get(j).floatValue(); float norm1 = storedNorms.get(j).floatValue();
assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001);
} }
@ -229,7 +230,7 @@ public class TestNorms extends LuceneTestCase {
// return unique norm values that are unchanged by encoding/decoding // return unique norm values that are unchanged by encoding/decoding
private float nextNorm(String fname) { private float nextNorm(String fname) {
float norm = lastNorm + normDelta; float norm = lastNorm + normDelta;
Similarity similarity = similarityProviderOne.get(fname); DefaultSimilarity similarity = (DefaultSimilarity) similarityProviderOne.get(fname);
do { do {
float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm)); float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm));
if (norm1 > lastNorm) { if (norm1 > lastNorm) {
@ -259,8 +260,8 @@ public class TestNorms extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return (float) state.getLength(); return encodeNormValue((float) state.getLength());
} }
} }

View File

@ -18,9 +18,9 @@ package org.apache.lucene.index;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
@ -30,7 +30,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.search.Explanation.IDFExplanation;
public class TestOmitTf extends LuceneTestCase { public class TestOmitTf extends LuceneTestCase {
@ -39,23 +38,14 @@ public class TestOmitTf extends LuceneTestCase {
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
public float coord(int overlap, int maxOverlap) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; }
public Similarity get(String field) { public Similarity get(String field) {
return new Similarity() { return new TFIDFSimilarity() {
@Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException { @Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException {
return new IDFExplanation() { return new Explanation(1.0f, "Inexplicable");
@Override
public float getIdf() {
return 1.0f;
}
@Override
public String explain() {
return "Inexplicable";
}
};
} }
}; };
} }
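A sketch (editor's addition) of the reshaped idfExplain hook shown above: it now receives the pre-built TermContext array and returns a plain Explanation whose value serves as the idf, replacing the removed IDFExplanation helper:

Similarity flatIdf = new DefaultSimilarity() {
  @Override
  public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException {
    return new Explanation(1.0f, "constant idf for testing");   // the Explanation's value doubles as the idf
  }
};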

View File

@ -149,7 +149,7 @@ public class TestParallelReader extends LuceneTestCase {
assertTrue(pr.isCurrent()); assertTrue(pr.isCurrent());
IndexReader modifier = IndexReader.open(dir1, false); IndexReader modifier = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(0, "f1", sim.encodeNormValue(100f)); modifier.setNorm(0, "f1", sim.encodeNormValue(100f));
modifier.close(); modifier.close();

View File

@ -20,7 +20,11 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Similarity.Stats;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
@ -187,8 +191,8 @@ final class JustCompileSearch {
static final class JustCompilePhraseScorer extends PhraseScorer { static final class JustCompilePhraseScorer extends PhraseScorer {
JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) { Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, similarity, norms); super(weight, postings, docScorer);
} }
@Override @Override
@ -243,12 +247,22 @@ final class JustCompileSearch {
static final class JustCompileSimilarity extends Similarity { static final class JustCompileSimilarity extends Similarity {
@Override @Override
public float idf(int docFreq, int numDocs) { public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public byte computeNorm(FieldInvertState state) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@ -256,11 +270,6 @@ final class JustCompileSearch {
public float sloppyFreq(int distance) { public float sloppyFreq(int distance) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override
public float tf(float freq) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
} }
static final class JustCompileSimilarityProvider implements SimilarityProvider { static final class JustCompileSimilarityProvider implements SimilarityProvider {
@ -348,17 +357,12 @@ final class JustCompileSearch {
} }
@Override @Override
public float getValue() { public void normalize(float norm, float topLevelBoost) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override @Override
public void normalize(float norm) { public float getValueForNormalization() throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public float sumOfSquaredWeights() throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }

View File

@ -62,9 +62,9 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// Disable length norm // Disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
@Override @Override

View File

@ -0,0 +1,203 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IndexDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
/**
* Tests the use of indexdocvalues in scoring.
*
* In the example, a docvalues field is used as a per-document boost (separate from the norm)
* @lucene.experimental
*/
public class TestDocValuesScoring extends LuceneTestCase {
private static final float SCORE_EPSILON = 0.001f; /* for comparing floats */
public void testSimple() throws Exception {
assumeFalse("PreFlex codec cannot work with IndexDocValues!",
"PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
Document doc = new Document();
Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field);
IndexDocValuesField dvField = new IndexDocValuesField("foo_boost");
doc.add(dvField);
Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field2);
field.setValue("quick brown fox");
field2.setValue("quick brown fox");
dvField.setFloat(2f); // boost x2
iw.addDocument(doc);
field.setValue("jumps over lazy brown dog");
field2.setValue("jumps over lazy brown dog");
dvField.setFloat(4f); // boost x4
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
// no boosting
IndexSearcher searcher1 = newSearcher(ir);
// boosting
IndexSearcher searcher2 = newSearcher(ir);
searcher2.setSimilarityProvider(new DefaultSimilarityProvider() {
final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost");
public Similarity get(String field) {
return "foo".equals(field) ? fooSim : super.get(field);
}
});
// in this case, we searched on field "foo". first document should have 2x the score.
TermQuery tq = new TermQuery(new Term("foo", "quick"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
TopDocs noboost = searcher1.search(tq, 10);
TopDocs boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
//System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc));
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*2f, SCORE_EPSILON);
// this query matches only the second document, which should have 4x the score.
tq = new TermQuery(new Term("foo", "jumps"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
noboost = searcher1.search(tq, 10);
boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*4f, SCORE_EPSILON);
// search on field bar just for kicks, nothing should happen, since we set up
// our sim provider to only use foo_boost for field foo.
tq = new TermQuery(new Term("bar", "quick"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
noboost = searcher1.search(tq, 10);
boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON);
searcher1.close();
searcher2.close();
ir.close();
dir.close();
}
/**
* Similarity that wraps another similarity and boosts the final score
* according to what's in a docvalues field.
*
* @lucene.experimental
*/
static class BoostingSimilarity extends Similarity {
private final Similarity sim;
private final String boostField;
public BoostingSimilarity(Similarity sim, String boostField) {
this.sim = sim;
this.boostField = boostField;
}
@Override
public byte computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
}
@Override
public float sloppyFreq(int distance) {
return sim.sloppyFreq(distance);
}
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
return sim.computeStats(searcher, fieldName, queryBoost, termContexts);
}
@Override
public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
final ExactDocScorer sub = sim.exactDocScorer(stats, fieldName, context);
final Source values = context.reader.docValues(boostField).getSource();
return new ExactDocScorer() {
@Override
public float score(int doc, int freq) {
return (float) values.getFloat(doc) * sub.score(doc, freq);
}
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")");
Explanation simExplanation = sub.explain(doc, freq);
Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:");
expl.addDetail(boostExplanation);
expl.addDetail(simExplanation);
return expl;
}
};
}
@Override
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
final SloppyDocScorer sub = sim.sloppyDocScorer(stats, fieldName, context);
final Source values = context.reader.docValues(boostField).getSource();
return new SloppyDocScorer() {
@Override
public float score(int doc, float freq) {
return (float) values.getFloat(doc) * sub.score(doc, freq);
}
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")");
Explanation simExplanation = sub.explain(doc, freq);
Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:");
expl.addDetail(boostExplanation);
expl.addDetail(simExplanation);
return expl;
}
};
}
}
}

View File

@ -49,34 +49,12 @@ public class TestMatchAllDocsQuery extends LuceneTestCase {
IndexSearcher is = newSearcher(ir); IndexSearcher is = newSearcher(ir);
ScoreDoc[] hits; ScoreDoc[] hits;
// assert with norms scoring turned off
hits = is.search(new MatchAllDocsQuery(), null, 1000).scoreDocs; hits = is.search(new MatchAllDocsQuery(), null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals("one", is.doc(hits[0].doc).get("key")); assertEquals("one", is.doc(hits[0].doc).get("key"));
assertEquals("two", is.doc(hits[1].doc).get("key")); assertEquals("two", is.doc(hits[1].doc).get("key"));
assertEquals("three four", is.doc(hits[2].doc).get("key")); assertEquals("three four", is.doc(hits[2].doc).get("key"));
// assert with norms scoring turned on
MatchAllDocsQuery normsQuery = new MatchAllDocsQuery("key");
hits = is.search(normsQuery, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals("three four", is.doc(hits[0].doc).get("key"));
assertEquals("two", is.doc(hits[1].doc).get("key"));
assertEquals("one", is.doc(hits[2].doc).get("key"));
// change norm & retest
is.getIndexReader().setNorm(0, "key", is.getSimilarityProvider().get("key").encodeNormValue(400f));
normsQuery = new MatchAllDocsQuery("key");
hits = is.search(normsQuery, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals("one", is.doc(hits[0].doc).get("key"));
assertEquals("three four", is.doc(hits[1].doc).get("key"));
assertEquals("two", is.doc(hits[2].doc).get("key"));
// some artificial queries to trigger the use of skipTo(): // some artificial queries to trigger the use of skipTo():
BooleanQuery bq = new BooleanQuery(); BooleanQuery bq = new BooleanQuery();

View File

@ -24,9 +24,9 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiFields;
import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -312,21 +312,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public IDFExplanation idfExplain(Collection<Term> terms, public Explanation idfExplain(TermContext stats[],
IndexSearcher searcher) throws IOException { IndexSearcher searcher) throws IOException {
return new IDFExplanation() { return new Explanation(10f, "just a test");
@Override
public float getIdf() {
return 10f;
}
@Override
public String explain() {
return "just a test";
}
};
} }
}; };
} }
@ -336,7 +324,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
query.add(new Term[] { new Term("body", "this"), new Term("body", "that") }); query.add(new Term[] { new Term("body", "this"), new Term("body", "that") });
query.add(new Term("body", "is")); query.add(new Term("body", "is"));
Weight weight = query.createWeight(searcher); Weight weight = query.createWeight(searcher);
assertEquals(10f * 10f, weight.sumOfSquaredWeights(), 0.001f); assertEquals(10f * 10f, weight.getValueForNormalization(), 0.001f);
writer.close(); writer.close();
searcher.close(); searcher.close();

View File

@ -50,7 +50,7 @@ public class TestSetNorm extends LuceneTestCase {
// reset the boost of each instance of this document // reset the boost of each instance of this document
IndexReader reader = IndexReader.open(store, false); IndexReader reader = IndexReader.open(store, false);
Similarity similarity = new DefaultSimilarity(); DefaultSimilarity similarity = new DefaultSimilarity();
reader.setNorm(0, "field", similarity.encodeNormValue(1.0f)); reader.setNorm(0, "field", similarity.encodeNormValue(1.0f));
reader.setNorm(1, "field", similarity.encodeNormValue(2.0f)); reader.setNorm(1, "field", similarity.encodeNormValue(2.0f));
reader.setNorm(2, "field", similarity.encodeNormValue(4.0f)); reader.setNorm(2, "field", similarity.encodeNormValue(4.0f));

View File

@ -18,8 +18,9 @@ package org.apache.lucene.search;
*/ */
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
@ -30,7 +31,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.search.Explanation.IDFExplanation;
/** Similarity unit test. /** Similarity unit test.
* *
@ -42,22 +42,13 @@ public class TestSimilarity extends LuceneTestCase {
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
public float coord(int overlap, int maxOverlap) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; }
public Similarity get(String field) { public Similarity get(String field) {
return new Similarity() { return new DefaultSimilarity() {
@Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException { @Override public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException {
return new IDFExplanation() { return new Explanation(1.0f, "Inexplicable");
@Override
public float getIdf() {
return 1.0f;
}
@Override
public String explain() {
return "Inexplicable";
}
};
} }
}; };
} }

View File

@ -105,10 +105,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
} }
private class Sim1 extends Similarity { private class Sim1 extends TFIDFSimilarity {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return 1f; return encodeNormValue(1f);
} }
@Override @Override
@ -127,10 +127,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
} }
private class Sim2 extends Similarity { private class Sim2 extends TFIDFSimilarity {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return 10f; return encodeNormValue(10f);
} }
@Override @Override

View File

@ -17,7 +17,6 @@ package org.apache.lucene.search.payloads;
*/ */
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
@ -45,7 +44,7 @@ import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English; import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.TermContext;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -325,8 +324,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
//Make everything else 1 so we see the effect of the payload //Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
@Override @Override
@ -341,18 +340,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
// idf used for phrase queries // idf used for phrase queries
@Override @Override
public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException { public Explanation idfExplain(TermContext states[], IndexSearcher searcher) throws IOException {
return new IDFExplanation() { return new Explanation(1.0f, "Inexplicable");
@Override
public float getIdf() {
return 1.0f;
}
@Override
public String explain() {
return "Inexplicable";
}
};
} }
}; };
} }

View File

@ -318,8 +318,8 @@ public class TestPayloadTermQuery extends LuceneTestCase {
//Make everything else 1 so we see the effect of the payload //Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
@Override @Override

View File

@ -135,8 +135,8 @@ final class JustCompileSearchSpans {
static final class JustCompileSpanScorer extends SpanScorer { static final class JustCompileSpanScorer extends SpanScorer {
protected JustCompileSpanScorer(Spans spans, Weight weight, protected JustCompileSpanScorer(Spans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException { Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, norms); super(spans, weight, similarity, docScorer);
} }
@Override @Override

View File

@ -133,18 +133,13 @@ public class BlockJoinQuery extends Query {
} }
@Override @Override
public float getValue() { public float getValueForNormalization() throws IOException {
return childWeight.getValue(); return childWeight.getValueForNormalization();
} }
@Override @Override
public float sumOfSquaredWeights() throws IOException { public void normalize(float norm, float topLevelBoost) {
return childWeight.sumOfSquaredWeights(); childWeight.normalize(norm, topLevelBoost);
}
@Override
public void normalize(float norm) {
childWeight.normalize(norm);
} }
@Override @Override

View File

@@ -195,21 +195,14 @@ public class CustomScoreQuery extends Query {
       return CustomScoreQuery.this;
     }
-    /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
     @Override
-    public float getValue() {
-      return getBoost();
-    }
-    /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
-      float sum = subQueryWeight.sumOfSquaredWeights();
+    public float getValueForNormalization() throws IOException {
+      float sum = subQueryWeight.getValueForNormalization();
       for(int i = 0; i < valSrcWeights.length; i++) {
         if (qStrict) {
-          valSrcWeights[i].sumOfSquaredWeights(); // do not include ValueSource part in the query normalization
+          valSrcWeights[i].getValueForNormalization(); // do not include ValueSource part in the query normalization
         } else {
-          sum += valSrcWeights[i].sumOfSquaredWeights();
+          sum += valSrcWeights[i].getValueForNormalization();
         }
       }
       sum *= getBoost() * getBoost(); // boost each sub-weight
@@ -218,14 +211,14 @@ public class CustomScoreQuery extends Query {
     /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
     @Override
-    public void normalize(float norm) {
-      norm *= getBoost(); // incorporate boost
-      subQueryWeight.normalize(norm);
+    public void normalize(float norm, float topLevelBoost) {
+      topLevelBoost *= getBoost(); // incorporate boost
+      subQueryWeight.normalize(norm, topLevelBoost);
       for(int i = 0; i < valSrcWeights.length; i++) {
         if (qStrict) {
-          valSrcWeights[i].normalize(1); // do not normalize the ValueSource part
+          valSrcWeights[i].normalize(1, 1); // do not normalize the ValueSource part
         } else {
-          valSrcWeights[i].normalize(norm);
+          valSrcWeights[i].normalize(norm, topLevelBoost);
         }
       }
     }
@@ -245,7 +238,7 @@ public class CustomScoreQuery extends Query {
       for(int i = 0; i < valSrcScorers.length; i++) {
         valSrcScorers[i] = valSrcWeights[i].scorer(context, scorerContext.scoreDocsInOrder(true));
       }
-      return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, subQueryScorer, valSrcScorers);
+      return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, getBoost(), subQueryScorer, valSrcScorers);
     }
     @Override
@@ -265,11 +258,11 @@ public class CustomScoreQuery extends Query {
         valSrcExpls[i] = valSrcWeights[i].explain(info, doc);
       }
       Explanation customExp = CustomScoreQuery.this.getCustomScoreProvider(info).customExplain(doc,subQueryExpl,valSrcExpls);
-      float sc = getValue() * customExp.getValue();
+      float sc = getBoost() * customExp.getValue();
       Explanation res = new ComplexExplanation(
         true, sc, CustomScoreQuery.this.toString() + ", product of:");
       res.addDetail(customExp);
-      res.addDetail(new Explanation(getValue(), "queryBoost")); // actually using the q boost as q weight (== weight value)
+      res.addDetail(new Explanation(getBoost(), "queryBoost")); // actually using the q boost as q weight (== weight value)
       return res;
     }
@@ -294,10 +287,10 @@ public class CustomScoreQuery extends Query {
     private float vScores[]; // reused in score() to avoid allocating this array for each doc
     // constructor
-    private CustomScorer(CustomScoreProvider provider, CustomWeight w,
+    private CustomScorer(CustomScoreProvider provider, CustomWeight w, float qWeight,
         Scorer subQueryScorer, Scorer[] valSrcScorers) throws IOException {
       super(w);
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.subQueryScorer = subQueryScorer;
       this.valSrcScorers = valSrcScorers;
       this.vScores = new float[valSrcScorers.length];

View File

@@ -78,21 +78,16 @@ public class BoostedQuery extends Query {
     }
     @Override
-    public float getValue() {
-      return getBoost();
-    }
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
-      float sum = qWeight.sumOfSquaredWeights();
+    public float getValueForNormalization() throws IOException {
+      float sum = qWeight.getValueForNormalization();
       sum *= getBoost() * getBoost();
       return sum ;
     }
     @Override
-    public void normalize(float norm) {
-      norm *= getBoost();
-      qWeight.normalize(norm);
+    public void normalize(float norm, float topLevelBoost) {
+      topLevelBoost *= getBoost();
+      qWeight.normalize(norm, topLevelBoost);
     }
     @Override
@@ -101,7 +96,7 @@ public class BoostedQuery extends Query {
       if(subQueryScorer == null) {
        return null;
       }
-      return new BoostedQuery.CustomScorer(context, this, subQueryScorer, boostVal);
+      return new BoostedQuery.CustomScorer(context, this, getBoost(), subQueryScorer, boostVal);
     }
     @Override
@@ -128,11 +123,11 @@ public class BoostedQuery extends Query {
     private final DocValues vals;
     private final AtomicReaderContext readerContext;
-    private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w,
+    private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, float qWeight,
         Scorer scorer, ValueSource vs) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.scorer = scorer;
       this.readerContext = readerContext;
       this.vals = vs.getValues(weight.fcontext, readerContext);

View File

@@ -77,25 +77,20 @@ public class FunctionQuery extends Query {
     }
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
      queryWeight = getBoost();
      return queryWeight * queryWeight;
     }
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new AllScorer(context, this);
+      return new AllScorer(context, this, queryWeight);
     }
     @Override
@@ -114,10 +109,10 @@ public class FunctionQuery extends Query {
     final boolean hasDeletions;
     final Bits liveDocs;
-    public AllScorer(AtomicReaderContext context, FunctionWeight w) throws IOException {
+    public AllScorer(AtomicReaderContext context, FunctionWeight w, float qWeight) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.reader = context.reader;
       this.maxDoc = reader.maxDoc();
       this.hasDeletions = reader.hasDeletions();

View File

@@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.queries.function.DocValues;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.lucene.util.BytesRef;
 import java.io.IOException;
@@ -42,9 +43,11 @@ public class IDFValueSource extends DocFreqValueSource {
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     IndexSearcher searcher = (IndexSearcher)context.get("searcher");
     Similarity sim = searcher.getSimilarityProvider().get(field);
-    // todo: we need docFreq that takes a BytesRef
-    int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString()));
-    float idf = sim.idf(docfreq, searcher.maxDoc());
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes));
+    float idf = ((TFIDFSimilarity)sim).idf(docfreq, searcher.maxDoc());
     return new ConstDoubleDocValues(idf, this);
   }
 }
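The guard added here is the general recipe for callers that still need raw tf/idf statistics now that the base Similarity no longer exposes them: check for TFIDFSimilarity and cast before use. A short usage sketch outside the ValueSource (the field and term literals are illustrative):

    Similarity sim = searcher.getSimilarityProvider().get("body");
    if (!(sim instanceof TFIDFSimilarity)) {
      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
    }
    TFIDFSimilarity tfidf = (TFIDFSimilarity) sim;
    int docFreq = searcher.docFreq(new Term("body", "lucene"));
    float idf = tfidf.idf(docFreq, searcher.maxDoc());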

View File

@@ -23,6 +23,8 @@ import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.docvalues.FloatDocValues;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import java.io.IOException;
 import java.util.Map;
@@ -49,7 +51,11 @@ public class NormValueSource extends ValueSource {
   @Override
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     IndexSearcher searcher = (IndexSearcher)context.get("searcher");
-    final Similarity similarity = searcher.getSimilarityProvider().get(field);
+    Similarity sim = searcher.getSimilarityProvider().get(field);
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    final TFIDFSimilarity similarity = (TFIDFSimilarity) sim;
     final byte[] norms = readerContext.reader.norms(field);
     if (norms == null) {
       return new ConstDoubleDocValues(0.0, this);

View File

@@ -24,6 +24,7 @@ import org.apache.lucene.queries.function.docvalues.FloatDocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.lucene.util.BytesRef;
 import java.io.IOException;
@@ -43,7 +44,11 @@ public class TFValueSource extends TermFreqValueSource {
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     Fields fields = readerContext.reader.fields();
     final Terms terms = fields.terms(field);
-    final Similarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field);
+    final Similarity sim = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field);
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    final TFIDFSimilarity similarity = (TFIDFSimilarity) sim;
     return new FloatDocValues(this) {
       DocsEnum docs ;

View File

@@ -354,25 +354,20 @@ class SpatialDistanceQuery extends Query {
     }
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new SpatialScorer(context, this);
+      return new SpatialScorer(context, this, queryWeight);
     }
     @Override
@@ -405,10 +400,10 @@ class SpatialDistanceQuery extends Query {
     int lastDistDoc;
     double lastDist;
-    public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w) throws IOException {
+    public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w, float qWeight) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.reader = readerContext.reader;
       this.maxDoc = reader.maxDoc();
       this.liveDocs = reader.getLiveDocs();

View File

@@ -168,19 +168,15 @@ class JoinQuery extends Query {
       return JoinQuery.this;
     }
-    public float getValue() {
-      return getBoost();
-    }
     @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
@@ -223,7 +219,7 @@ class JoinQuery extends Query {
       DocIdSet readerSet = filter.getDocIdSet(context);
       if (readerSet == null) readerSet=DocIdSet.EMPTY_DOCIDSET;
-      return new JoinScorer(this, readerSet.iterator());
+      return new JoinScorer(this, readerSet.iterator(), getBoost());
     }
@@ -514,9 +510,9 @@ class JoinQuery extends Query {
     final float score;
     int doc = -1;
-    public JoinScorer(Weight w, DocIdSetIterator iter) throws IOException {
+    public JoinScorer(Weight w, DocIdSetIterator iter, float score) throws IOException {
       super(w);
-      score = w.getValue();
+      this.score = score;
       this.iter = iter==null ? DocIdSet.EMPTY_DOCIDSET.iterator() : iter;
     }

View File

@@ -106,31 +106,26 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend
     }
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new ConstantScorer(context, this);
+      return new ConstantScorer(context, this, queryWeight);
     }
     @Override
     public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
-      ConstantScorer cs = new ConstantScorer(context, this);
+      ConstantScorer cs = new ConstantScorer(context, this, queryWeight);
       boolean exists = cs.docIdSetIterator.advance(doc) == doc;
       ComplexExplanation result = new ComplexExplanation();
@@ -157,9 +152,9 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend
     final float theScore;
     int doc = -1;
-    public ConstantScorer(AtomicReaderContext context, ConstantWeight w) throws IOException {
+    public ConstantScorer(AtomicReaderContext context, ConstantWeight w, float theScore) throws IOException {
       super(w);
-      theScore = w.getValue();
+      this.theScore = theScore;
       DocIdSet docIdSet = filter instanceof SolrFilter ? ((SolrFilter)filter).getDocIdSet(w.context, context) : filter.getDocIdSet(context);
       if (docIdSet == null) {
         docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator();

View File

@@ -21,7 +21,7 @@ import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.codecs.CodecProvider;
 import org.apache.lucene.search.DefaultSimilarity;
 import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -305,7 +305,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
     assertQ(req("fl","*,score","q", "{!func}docfreq($field,$value)", "fq","id:6", "field","a_t", "value","cow"), "//float[@name='score']='3.0'");
     assertQ(req("fl","*,score","q", "{!func}termfreq(a_t,cow)", "fq","id:6"), "//float[@name='score']='5.0'");
-    Similarity similarity = new DefaultSimilarity();
+    TFIDFSimilarity similarity = new DefaultSimilarity();
     // make sure it doesn't get a NPE if no terms are present in a field.
     assertQ(req("fl","*,score","q", "{!func}termfreq(nofield_t,cow)", "fq","id:6"), "//float[@name='score']='0.0'");
@@ -323,7 +323,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
     state.setBoost(1.0f);
     state.setLength(4);
     assertQ(req("fl","*,score","q", "{!func}norm(a_t)", "fq","id:2"),
-        "//float[@name='score']='" + similarity.computeNorm(state) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
+        "//float[@name='score']='" + similarity.decodeNormValue(similarity.computeNorm(state)) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
     // test that ord and rord are working on a global index basis, not just
     // at the segment level (since Lucene 2.9 has switched to per-segment searching)
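The asserted value in that last hunk amounts to a norm round trip. A standalone sketch of the arithmetic, assuming DefaultSimilarity's 1/sqrt(length) length norm and the boost of 1 set in the test: four terms give a length norm of 0.5, which survives the byte quantization exactly, so decoding the encoded norm reproduces it.

    TFIDFSimilarity similarity = new DefaultSimilarity();
    FieldInvertState state = new FieldInvertState();
    state.setBoost(1.0f);
    state.setLength(4);                                    // 1/sqrt(4) = 0.5
    byte encoded = similarity.computeNorm(state);          // norm is returned already encoded
    float decoded = similarity.decodeNormValue(encoded);   // 0.5f, exactly representable in the byte encoding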