From ea67cd8b2c43274d80999c140c948a6fe1ffe52e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 8 Jul 2011 05:08:05 +0000 Subject: [PATCH] LUCENE-2392: decouple vector space scoring from Query/Weight/Scorer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144158 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 + lucene/MIGRATE.txt | 10 + .../instantiated/InstantiatedIndexWriter.java | 3 +- .../lucene/index/memory/MemoryIndex.java | 8 +- .../lucene/index/FieldNormModifier.java | 2 +- .../lucene/misc/SweetSpotSimilarity.java | 4 +- .../lucene/index/TestFieldNormModifier.java | 4 +- .../lucene/misc/SweetSpotSimilarityTest.java | 35 +- .../lucene/misc/TestLengthNormModifier.java | 8 +- .../lucene/search/FuzzyLikeThisQuery.java | 6 +- .../lucene/search/similar/MoreLikeThis.java | 9 +- .../apache/lucene/document/AbstractField.java | 4 +- .../org/apache/lucene/document/Fieldable.java | 4 +- .../org/apache/lucene/index/IndexReader.java | 5 +- .../lucene/index/NormsWriterPerField.java | 3 +- .../apache/lucene/search/BooleanQuery.java | 13 +- .../search/ConstantScoreAutoRewrite.java | 16 +- .../lucene/search/ConstantScoreQuery.java | 23 +- .../lucene/search/DefaultSimilarity.java | 8 +- .../lucene/search/DisjunctionMaxQuery.java | 14 +- .../lucene/search/ExactPhraseScorer.java | 26 +- .../org/apache/lucene/search/Explanation.java | 21 - .../apache/lucene/search/FilteredQuery.java | 14 +- .../apache/lucene/search/IndexSearcher.java | 6 +- .../lucene/search/MatchAllDocsQuery.java | 44 +- .../lucene/search/MultiPhraseQuery.java | 128 +-- .../apache/lucene/search/MultiTermQuery.java | 6 +- .../org/apache/lucene/search/PhraseQuery.java | 168 ++-- .../apache/lucene/search/PhraseScorer.java | 15 +- .../apache/lucene/search/ScoringRewrite.java | 16 +- .../org/apache/lucene/search/Similarity.java | 843 ++++-------------- .../lucene/search/SloppyPhraseScorer.java | 8 +- .../apache/lucene/search/TFIDFSimilarity.java | 831 +++++++++++++++++ .../lucene/search/TermCollectingRewrite.java | 4 +- .../org/apache/lucene/search/TermQuery.java | 141 +-- .../org/apache/lucene/search/TermScorer.java | 30 +- .../apache/lucene/search/TopTermsRewrite.java | 14 +- .../java/org/apache/lucene/search/Weight.java | 17 +- .../search/payloads/PayloadNearQuery.java | 50 +- .../search/payloads/PayloadTermQuery.java | 64 +- .../spans/SpanMultiTermQueryWrapper.java | 6 +- .../lucene/search/spans/SpanScorer.java | 28 +- .../lucene/search/spans/SpanWeight.java | 119 +-- ...rReaderTermState.java => TermContext.java} | 54 +- .../lucene/search/AssertingIndexSearcher.java | 9 +- .../org/apache/lucene/search/CheckHits.java | 6 +- .../index/TestBackwardsCompatibility.java | 12 +- .../lucene/index/TestDeletionPolicy.java | 7 +- .../lucene/index/TestIndexFileDeleter.java | 2 +- .../apache/lucene/index/TestIndexReader.java | 10 +- .../lucene/index/TestIndexReaderClone.java | 6 +- .../index/TestIndexReaderCloneNorms.java | 12 +- .../index/TestIndexReaderOnDiskFull.java | 2 +- .../lucene/index/TestIndexReaderReopen.java | 8 +- .../lucene/index/TestMaxTermFrequency.java | 4 +- .../org/apache/lucene/index/TestNorms.java | 15 +- .../org/apache/lucene/index/TestOmitTf.java | 20 +- .../lucene/index/TestParallelReader.java | 2 +- .../lucene/search/JustCompileSearch.java | 36 +- .../search/TestDisjunctionMaxQuery.java | 4 +- .../lucene/search/TestDocValuesScoring.java | 203 +++++ .../lucene/search/TestMatchAllDocsQuery.java | 22 - .../lucene/search/TestMultiPhraseQuery.java | 20 +- 
.../org/apache/lucene/search/TestSetNorm.java | 2 +- .../apache/lucene/search/TestSimilarity.java | 21 +- .../lucene/search/TestSimilarityProvider.java | 12 +- .../search/payloads/TestPayloadNearQuery.java | 21 +- .../search/payloads/TestPayloadTermQuery.java | 4 +- .../search/spans/JustCompileSearchSpans.java | 4 +- .../lucene/search/join/BlockJoinQuery.java | 13 +- .../lucene/queries/CustomScoreQuery.java | 35 +- .../lucene/queries/function/BoostedQuery.java | 21 +- .../queries/function/FunctionQuery.java | 17 +- .../function/valuesource/IDFValueSource.java | 9 +- .../function/valuesource/NormValueSource.java | 8 +- .../function/valuesource/TFValueSource.java | 7 +- .../org/apache/solr/schema/LatLonType.java | 17 +- .../apache/solr/search/JoinQParserPlugin.java | 16 +- .../solr/search/SolrConstantScoreQuery.java | 19 +- .../search/function/TestFunctionQuery.java | 6 +- 80 files changed, 1814 insertions(+), 1656 deletions(-) create mode 100644 lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java rename lucene/src/java/org/apache/lucene/util/{PerReaderTermState.java => TermContext.java} (73%) create mode 100644 lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d484e5b3c61..815cc0ad288 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -156,6 +156,12 @@ Changes in backwards compatibility policy the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt for more information (Chris Male) +* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you + extended Similarity directly before, you should extend TFIDFSimilarity instead. + Similarity is now a lower-level API to implement other scoring algorithms. + See MIGRATE.txt for more details. + (David Nemeskey, Simon Willnauer, Mike Mccandless, Robert Muir) + Changes in Runtime Behavior * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index ffbdef459c5..268ca527fb3 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -382,3 +382,13 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing - o.a.l.search.function.ShortFieldSource -> o.a.l.queries.function.valuesource.ShortFieldSource - o.a.l.search.function.ValueSource -> o.a.l.queries.function.ValueSource - o.a.l.search.function.ValueSourceQuery -> o.a.l.queries.function.FunctionQuery + +* LUCENE-2392: Enable flexible scoring: + + The existing "Similarity" api is now TFIDFSimilarity, if you were extending + Similarity before, you should likely extend this instead. + + Weight.normalize no longer takes a norm value that incorporates the top-level + boost from outer queries such as BooleanQuery, instead it takes 2 parameters, + the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has + been renamed to Weight.getValueForNormalization(). 
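To make the MIGRATE.txt entry above concrete, here is a hedged before/after sketch of the two renamed normalization hooks on a custom Weight. Only the method names and signatures come from this patch; the surrounding class and the queryWeight field are illustrative placeholders, not code from this commit.

    // Before this change (old API), a custom Weight typically had:
    //   public float sumOfSquaredWeights() { return queryWeight * queryWeight; }
    //   public void normalize(float norm)  { queryWeight *= norm; }

    // After this change (new API):
    @Override
    public float getValueForNormalization() {
      return queryWeight * queryWeight;   // formerly sumOfSquaredWeights()
    }

    @Override
    public void normalize(float queryNorm, float topLevelBoost) {
      // boosts of enclosing queries (e.g. BooleanQuery) are no longer folded
      // into the norm value; they arrive separately as topLevelBoost
      queryWeight *= queryNorm * topLevelBoost;
    }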
diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index 26c166e9ce1..8b2635085c7 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -240,8 +240,7 @@ public class InstantiatedIndexWriter implements Closeable { final FieldInvertState invertState = new FieldInvertState(); invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost()); invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength); - final float norm = similarityProvider.get(fieldName).computeNorm(invertState); - normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).encodeNormValue(norm); + normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).computeNorm(invertState); } else { System.currentTimeMillis(); } diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 9e383eb2900..727d47cafa3 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -51,7 +51,6 @@ import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; @@ -1202,19 +1201,18 @@ public class MemoryIndex { int numOverlapTokens = info != null ? info.numOverlapTokens : 0; float boost = info != null ? 
info.getBoost() : 1.0f; FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost); - float n = fieldSim.computeNorm(invertState); - byte norm = fieldSim.encodeNormValue(n); + byte norm = fieldSim.computeNorm(invertState); norms = new byte[] {norm}; // cache it for future reuse cachedNorms = norms; cachedFieldName = fieldName; cachedSimilarity = sim; - if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens); + if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + norm + ":" + numTokens); } return norms; } - + @Override protected void doSetNorm(int doc, String fieldName, byte value) { throw new UnsupportedOperationException(); diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java index d3673243e0d..c17ac02aed7 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java @@ -147,7 +147,7 @@ public class FieldNormModifier { for (int d = 0; d < termCounts.length; d++) { if (liveDocs == null || liveDocs.get(d)) { invertState.setLength(termCounts[d]); - subReader.setNorm(d, field, fieldSim.encodeNormValue(fieldSim.computeNorm(invertState))); + subReader.setNorm(d, field, fieldSim.computeNorm(invertState)); } } } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index c5c454a14bb..f1ac1459532 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -106,7 +106,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity { * discountOverlaps is true by default or true for this * specific field. */ @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { final int numTokens; if (discountOverlaps) @@ -114,7 +114,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity { else numTokens = state.getLength(); - return state.getBoost() * computeLengthNorm(numTokens); + return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens)); } /** diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java index 9af69bba012..d5896a45a27 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java @@ -49,8 +49,8 @@ public class TestFieldNormModifier extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { - return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(state.getBoost() * (discountOverlaps ? 
state.getLength() - state.getNumOverlap() : state.getLength())); } }; } diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java index f7f33f3748f..0e9732c4a91 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java @@ -21,6 +21,7 @@ package org.apache.lucene.misc; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.FieldInvertState; @@ -58,15 +59,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase { invertState.setLength(i); assertEquals("3,10: spot i="+i, 1.0f, - s.computeNorm(invertState), + ss.decodeNormValue(s.computeNorm(invertState)), 0.0f); } for (int i = 10; i < 1000; i++) { invertState.setLength(i-9); - final float normD = d.computeNorm(invertState); + final byte normD = d.computeNorm(invertState); invertState.setLength(i); - final float normS = s.computeNorm(invertState); + final byte normS = s.computeNorm(invertState); assertEquals("3,10: 10 allTerms = new ArrayList(); + ArrayList allTerms = new ArrayList(); for(final Term[] terms: termArrays) { for (Term term: terms) { - allTerms.add(term); + allTerms.add(TermContext.build(context, term, true)); } } - idfExp = similarity.idfExplain(allTerms, searcher); - idf = idfExp.getIdf(); + stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()])); } @Override public Query getQuery() { return MultiPhraseQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -222,8 +214,7 @@ public class MultiPhraseQuery extends Query { } if (slop == 0) { - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, - reader.norms(field)); + ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context)); if (s.noDocs) { return null; } else { @@ -231,84 +222,29 @@ public class MultiPhraseQuery extends Query { } } else { return new SloppyPhraseScorer(this, postingsFreqs, similarity, - slop, reader.norms(field)); + slop, similarity.sloppyDocScorer(stats, field, context)); } } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + idfExp.explain() +")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - 
Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+ - "), product of:"); - + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { Scorer scorer = scorer(context, ScorerContext.def()); - if (scorer == null) { - return new Explanation(0.0f, "no matching docs"); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - - Explanation tfExplanation = new Explanation(); - int d = scorer.advance(doc); - float phraseFreq; - if (d == doc) { - phraseFreq = scorer.freq(); - } else { - phraseFreq = 0.0f; - } - - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? 
similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch())); - fieldExpl.setValue(tfExplanation.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index 3c8c267691c..e8e7020975b 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * An abstract {@link Query} that matches documents @@ -154,7 +154,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { + protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, TermContext states) { final TermQuery tq = new TermQuery(term, states); tq.setBoost(boost); topLevel.add(tq, BooleanClause.Occur.SHOULD); @@ -195,7 +195,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { + protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, TermContext states) { final Query q = new ConstantScoreQuery(new TermQuery(term, states)); q.setBoost(boost); topLevel.add(q, BooleanClause.Occur.SHOULD); diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java index 300e63f30e1..470cc6656c9 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java @@ -22,10 +22,16 @@ import java.util.Set; import java.util.ArrayList; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; @@ -171,18 +177,17 @@ public class PhraseQuery extends Query { private class PhraseWeight extends Weight { private final Similarity similarity; - private float value; - private float idf; - private float queryNorm; - private float queryWeight; - private IDFExplanation idfExp; 
+ private final Similarity.Stats stats; + private transient TermContext states[]; public PhraseWeight(IndexSearcher searcher) throws IOException { this.similarity = searcher.getSimilarityProvider().get(field); - - idfExp = similarity.idfExplain(terms, searcher); - idf = idfExp.getIdf(); + final ReaderContext context = searcher.getTopReaderContext(); + states = new TermContext[terms.size()]; + for (int i = 0; i < terms.size(); i++) + states[i] = TermContext.build(context, terms.get(i), true); + stats = similarity.computeStats(searcher, field, getBoost(), states); } @Override @@ -192,19 +197,13 @@ public class PhraseQuery extends Query { public Query getQuery() { return PhraseQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -216,21 +215,26 @@ public class PhraseQuery extends Query { PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()]; for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); + final TermState state = states[i].get(context.ord); + if (state == null) { /* term doesnt exist in this segment */ + assert termNotInReader(reader, field, t.bytes()) : "no termstate found but term exists in reader"; + return null; + } DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs, t.field(), - t.bytes()); + t.bytes(), + state); // PhraseQuery on a field that did not index // positions. 
if (postingsEnum == null) { - if (reader.termDocsEnum(liveDocs, t.field(), t.bytes()) != null) { - // term does exist, but has no positions - throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); - } else { - // term does not exist - return null; - } + assert (reader.termDocsEnum(liveDocs, t.field(), t.bytes(), state) != null) : "termstate found but no term exists in reader"; + // term does exist, but has no positions + throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t); + // get the docFreq without seeking + TermsEnum te = reader.fields().terms(field).getThreadTermsEnum(); + te.seekExact(t.bytes(), state); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t); } // sort by increasing docFreq order @@ -239,8 +243,7 @@ public class PhraseQuery extends Query { } if (slop == 0) { // optimize exact case - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, - reader.norms(field)); + ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context)); if (s.noDocs) { return null; } else { @@ -248,96 +251,35 @@ public class PhraseQuery extends Query { } } else { return - new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, - reader.norms(field)); + new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context)); } } + + private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { + // only called from assert + final Terms terms = reader.terms(field); + return terms == null || terms.docFreq(bytes) == 0; + } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - StringBuilder docFreqs = new StringBuilder(); - StringBuilder query = new StringBuilder(); - query.append('\"'); - docFreqs.append(idfExp.explain()); - for (int i = 0; i < terms.size(); i++) { - if (i != 0) { - query.append(" "); - } - - Term term = terms.get(i); - - query.append(term.text()); - } - query.append('\"'); - - Explanation idfExpl = - new Explanation(idf, "idf(" + field + ":" + docFreqs + ")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - Explanation fieldExpl = new Explanation(); - fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+ - "), product of:"); - + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { Scorer scorer = scorer(context, ScorerContext.def()); - if (scorer == null) { - return new Explanation(0.0f, 
"no matching docs"); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - Explanation tfExplanation = new Explanation(); - int d = scorer.advance(doc); - float phraseFreq; - if (d == doc) { - phraseFreq = scorer.freq(); - } else { - phraseFreq = 0.0f; - } - - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setValue(tfExplanation.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - result.setMatch(tfExplanation.isMatch()); - return result; + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java index da84dbcca42..f50ae07032c 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java @@ -30,9 +30,6 @@ import java.io.IOException; * means a match. */ abstract class PhraseScorer extends Scorer { - protected byte[] norms; - protected float value; - private boolean firstTime = true; private boolean more = true; protected PhraseQueue pq; @@ -40,14 +37,12 @@ abstract class PhraseScorer extends Scorer { private float freq; //phrase frequency in current doc as computed by phraseFreq(). - protected final Similarity similarity; + protected final Similarity.SloppyDocScorer docScorer; PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { + Similarity.SloppyDocScorer docScorer) throws IOException { super(weight); - this.similarity = similarity; - this.norms = norms; - this.value = weight.getValue(); + this.docScorer = docScorer; // convert tps to a list of phrase positions. // note: phrase-position differs from term-position in that its position @@ -107,9 +102,7 @@ abstract class PhraseScorer extends Scorer { @Override public float score() throws IOException { - //System.out.println("scoring " + first.doc); - float raw = similarity.tf(freq) * value; // raw score - return norms == null ? 
raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize + return docScorer.score(first.doc, freq); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java index f9451161a3e..098d8b4a8b7 100644 --- a/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; @@ -56,7 +56,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr @Override protected void addClause(BooleanQuery topLevel, Term term, int docCount, - float boost, PerReaderTermState states) { + float boost, TermContext states) { final TermQuery tq = new TermQuery(term, states); tq.setBoost(boost); topLevel.add(tq, BooleanClause.Occur.SHOULD); @@ -117,7 +117,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr if (size > 0) { final int sort[] = col.terms.sort(col.termsEnum.getComparator()); final float[] boost = col.array.boost; - final PerReaderTermState[] termStates = col.array.termState; + final TermContext[] termStates = col.array.termState; for (int i = 0; i < size; i++) { final int pos = sort[i]; final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef())); @@ -150,12 +150,12 @@ public abstract class ScoringRewrite extends TermCollectingRewr if (e < 0 ) { // duplicate term: update docFreq final int pos = (-e)-1; - array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq()); + array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; } else { // new entry: we populate the entry initially array.boost[e] = boostAtt.getBoost(); - array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq()); + array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); ScoringRewrite.this.checkMaxClauseCount(terms.size()); } return true; @@ -165,7 +165,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ static final class TermFreqBoostByteStart extends DirectBytesStartArray { float[] boost; - PerReaderTermState[] termState; + TermContext[] termState; public TermFreqBoostByteStart(int initSize) { super(initSize); @@ -175,7 +175,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr public int[] init() { final int[] ord = super.init(); boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; - termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; assert termState.length >= ord.length && boost.length >= ord.length; return ord; } @@ -185,7 +185,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr final int[] ord = super.grow(); boost = 
ArrayUtil.grow(boost, ord.length); if (termState.length < ord.length) { - PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(termState, 0, tmpTermState, 0, termState.length); termState = tmpTermState; } diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/Similarity.java index e8ae33f6ea1..5a907fcb1be 100644 --- a/lucene/src/java/org/apache/lucene/search/Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/Similarity.java @@ -19,594 +19,111 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.Collection; +import org.apache.lucene.document.IndexDocValuesField; // javadoc import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation.IDFExplanation; -import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.index.IndexReader; // javadoc +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Terms; // javadoc +import org.apache.lucene.search.spans.SpanQuery; // javadoc +import org.apache.lucene.util.SmallFloat; // javadoc +import org.apache.lucene.util.TermContext; /** + * Similarity defines the components of Lucene scoring. + *

* Expert: Scoring API. - * - *

Similarity defines the components of Lucene scoring. - * Overriding computation of these components is a convenient - * way to alter Lucene scoring. - * - *

Suggested reading: - * - * Introduction To Information Retrieval, Chapter 6. - * - *

The following describes how Lucene scoring evolves from - * underlying information retrieval models to (efficient) implementation. - * We first brief on VSM Score, - * then derive from it Lucene's Conceptual Scoring Formula, - * from which, finally, evolves Lucene's Practical Scoring Function - * (the latter is connected directly with Lucene classes and methods). - * - *

Lucene combines - * - * Boolean model (BM) of Information Retrieval - * with - * - * Vector Space Model (VSM) of Information Retrieval - - * documents "approved" by BM are scored by VSM. - * - *

In VSM, documents and queries are represented as - * weighted vectors in a multi-dimensional space, - * where each distinct index term is a dimension, - * and weights are - * Tf-idf values. - * - *

VSM does not require weights to be Tf-idf values, - * but Tf-idf values are believed to produce search results of high quality, - * and so Lucene is using Tf-idf. - * Tf and Idf are described in more detail below, - * but for now, for completion, let's just say that - * for given term t and document (or query) x, - * Tf(t,x) varies with the number of occurrences of term t in x - * (when one increases so does the other) and - * idf(t) similarly varies with the inverse of the - * number of index documents containing term t. - * - *

VSM score of document d for query q is the - * - * Cosine Similarity - * of the weighted query vectors V(q) and V(d): - * - *
 
- *
- *     cosine-similarity(q,d)  =  V(q) · V(d) / ( |V(q)| |V(d)| )
- *
- *                         VSM Score
- *
- * - * - * Where V(q) · V(d) is the - * dot product - * of the weighted vectors, - * and |V(q)| and |V(d)| are their - * Euclidean norms. - * - *

Note: the above equation can be viewed as the dot product of - * the normalized weighted vectors, in the sense that dividing - * V(q) by its euclidean norm is normalizing it to a unit vector. - * - *

Lucene refines VSM score for both search quality and usability: - *

    - *
- *   • Normalizing V(d) to the unit vector is known to be problematic in that
- *     it removes all document length information. For some documents removing
- *     this info is probably ok, e.g. a document made by duplicating a certain
- *     paragraph 10 times, especially if that paragraph is made of distinct terms.
- *     But for a document which contains no duplicated paragraphs, this might be
- *     wrong. To avoid this problem, a different document length normalization
- *     factor is used, which normalizes to a vector equal to or larger than the
- *     unit vector: doc-len-norm(d).
- *
- *   • At indexing, users can specify that certain documents are more important
- *     than others, by assigning a document boost. For this, the score of each
- *     document is also multiplied by its boost value doc-boost(d).
- *
- *   • Lucene is field based, hence each query term applies to a single field,
- *     document length normalization is by the length of the certain field, and
- *     in addition to document boost there are also document fields boosts.
- *
- *   • The same field can be added to a document during indexing several times,
- *     and so the boost of that field is the multiplication of the boosts of the
- *     separate additions (or parts) of that field within the document.
- *
- *   • At search time users can specify boosts to each query, sub-query, and each
- *     query term, hence the contribution of a query term to the score of a
- *     document is multiplied by the boost of that query term query-boost(q).
- *
- *   • A document may match a multi term query without containing all the terms
- *     of that query (this is correct for some of the queries), and users can
- *     further reward documents matching more query terms through a coordination
- *     factor, which is usually larger when more terms are matched: coord-factor(q,d).
- * - *

Under the simplifying assumption of a single field in the index, - * we get Lucene's Conceptual scoring formula: - * - *
 
- *
- *     score(q,d)  =  coord-factor(q,d) · query-boost(q)
- *                    · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)
- *
- *                  Lucene Conceptual Scoring Formula
- *

The conceptual formula is a simplification in the sense that (1) terms and documents - * are fielded and (2) boosts are usually per query term rather than per query. - * - *

We now describe how Lucene implements this conceptual scoring formula, and - * derive from it Lucene's Practical Scoring Function. - * - *

For efficient score computation some scoring components - * are computed and aggregated in advance: - * - *

    - *
- *   • Query-boost for the query (actually for each query term) is known when
- *     search starts.
- *
- *   • Query Euclidean norm |V(q)| can be computed when search starts, as it is
- *     independent of the document being scored. From a search optimization
- *     perspective, it is a valid question why bother to normalize the query at
- *     all, because all scored documents will be multiplied by the same |V(q)|,
- *     and hence document ranks (their order by score) will not be affected by
- *     this normalization. There are two good reasons to keep this normalization:
- *
- *       • Recall that Cosine Similarity can be used to find how similar two
- *         documents are. One can use Lucene for e.g. clustering, and use a
- *         document as a query to compute its similarity to other documents.
- *         In this use case it is important that the score of document d3 for
- *         query d1 is comparable to the score of document d3 for query d2.
- *         In other words, scores of a document for two distinct queries should
- *         be comparable. There are other applications that may require this.
- *         And this is exactly what normalizing the query vector V(q) provides:
- *         comparability (to a certain extent) of two or more queries.
- *
- *       • Applying query normalization on the scores helps to keep the scores
- *         around the unit vector, hence preventing loss of score data because
- *         of floating point precision limitations.
- *
- *   • Document length norm doc-len-norm(d) and document boost doc-boost(d)
- *     are known at indexing time. They are computed in advance and their
- *     multiplication is saved as a single value in the index: norm(d).
- *     (In the equations below, norm(t in d) means norm(field(t) in doc d)
- *     where field(t) is the field associated with term t.)
- * - *

Lucene's Practical Scoring Function is derived from the above. Each of its
- * factors corresponds to a component of the conceptual formula:
- *
- *     score(q,d)  =  coord(q,d) · queryNorm(q)
- *                    · SUM over t in q of ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
- *
- *                  Lucene Practical Scoring Function
- *

where + *

+ * This is a low-level API; you should only extend it if you want to implement
+ * an information retrieval model. If you are instead looking for a convenient way
+ * to alter Lucene's scoring, consider extending a higher-level implementation
+ * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
+ * just tweaking the default implementation: {@link DefaultSimilarity}.
+ *

+ * Similarity determines how Lucene weights terms, and Lucene interacts with + * this class at both index-time and + * query-time. + *

+ * + * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing + * the Similarity implementation to return a per-document byte for the field that will + * be later accessible via {@link IndexReader#norms(String)}. Lucene makes no assumption + * about what is in this byte, but it is most useful for encoding length normalization + * information. + *

+ * Implementations should carefully consider how the normalization byte is encoded: while + * Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost + * and length normalization information with {@link SmallFloat}, this might not be suitable + * for all purposes. + *

+ * Many formulas require the use of average document length, which can be computed via a
+ * combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()}.
+ *

+ * Because index-time boost is handled entirely at the application level anyway, + * an application can alternatively store the index-time boost separately using an + * {@link IndexDocValuesField}, and access this at query-time with + * {@link IndexReader#docValues(String)}. + *

+ * Finally, using index-time boosts (either via folding into the normalization byte or
+ * via IndexDocValues) is an inefficient way to boost the scores of different fields if the
+ * boost will be the same for every document. Instead, the Similarity can simply take a constant
+ * boost parameter C, and the SimilarityProvider can return different instances with
+ * different boosts depending upon field name.
+ *

+ * + * At query-time, Queries interact with the Similarity via these steps: *

    - *
- * 1. tf(t in d) correlates to the term's frequency, defined as the number of
- *    times term t appears in the currently scored document d. Documents that
- *    have more occurrences of a given term receive a higher score. Note that
- *    tf(t in q) is assumed to be 1 and therefore it does not appear in this
- *    equation. However, if a query contains the same term twice, there will be
- *    two term-queries with that same term and hence the computation would
- *    still be correct (although not very efficient). The default computation
- *    for tf(t in d) in
- *    {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
- *
- *        tf(t in d)  =  frequency^(1/2)
- *
- * 2. idf(t) stands for Inverse Document Frequency. This value correlates to
- *    the inverse of docFreq (the number of documents in which the term t
- *    appears). This means rarer terms give higher contribution to the total
- *    score. idf(t) appears for t in both the query and the document, hence it
- *    is squared in the equation. The default computation for idf(t) in
- *    {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
- *
- *        idf(t)  =  1 + log( numDocs / (docFreq + 1) )
- *
- * 3. coord(q,d) is a score factor based on how many of the query terms are
- *    found in the specified document. Typically, a document that contains more
- *    of the query's terms will receive a higher score than another document
- *    with fewer query terms. This is a search time factor computed in
- *    {@link SimilarityProvider#coord(int, int) coord(q,d)} by the
- *    SimilarityProvider in effect at search time.
- *
- * 4. queryNorm(q) is a normalizing factor used to make scores between queries
- *    comparable. This factor does not affect document ranking (since all
- *    ranked documents are multiplied by the same factor), but rather just
- *    attempts to make scores from different queries (or even different
- *    indexes) comparable. This is a search time factor computed by the
- *    SimilarityProvider in effect at search time. The default computation in
- *    {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
- *    produces a Euclidean norm:
- *
- *        queryNorm(q)  =  queryNorm(sumOfSquaredWeights)  =  1 / sumOfSquaredWeights^(1/2)
- *
- *    The sum of squared weights (of the query terms) is computed by the query
- *    {@link org.apache.lucene.search.Weight} object. For example, a
- *    {@link org.apache.lucene.search.BooleanQuery} computes this value as:
- *
- *        sumOfSquaredWeights  =  q.getBoost()² · SUM over t in q of ( idf(t) · t.getBoost() )²
- *
- * 5. t.getBoost() is a search time boost of term t in the query q as specified
- *    in the query text (see query syntax), or as set by application calls to
- *    {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}. Notice
- *    that there is really no direct API for accessing a boost of one term in a
- *    multi term query, but rather multi terms are represented in a query as
- *    multi {@link org.apache.lucene.search.TermQuery TermQuery} objects, and so
- *    the boost of a term in the query is accessible by calling the sub-query
- *    {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
- *
- * 6. norm(t,d) encapsulates a few (indexing time) boost and length factors:
- *
- *      • Document boost - set by calling
- *        {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
- *        before adding the document to the index.
- *      • Field boost - set by calling
- *        {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
- *        before adding the field to a document.
- *      • lengthNorm - computed when the document is added to the index in
- *        accordance with the number of tokens of this field in the document,
- *        so that shorter fields contribute more to the score. LengthNorm is
- *        computed by the Similarity class in effect at indexing.
- *
- *    The {@link #computeNorm} method is responsible for combining all of these
- *    factors into a single float.
- *
- *    When a document is added to the index, all the above factors are
- *    multiplied. If the document has multiple fields with the same name, all
- *    their boosts are multiplied together:
- *
- *        norm(t,d)  =  doc.getBoost() · lengthNorm · PRODUCT over field f in d named as t of f.getBoost()
- *
- *    However the resulting norm value is {@link #encodeNormValue(float) encoded}
- *    as a single byte before being stored. At search time, the norm byte value
- *    is read from the index {@link org.apache.lucene.store.Directory directory}
- *    and {@link #decodeNormValue(byte) decoded} back to a float norm value.
- *    This encoding/decoding, while reducing index size, comes with the price of
- *    precision loss - it is not guaranteed that decode(encode(x)) = x. For
- *    instance, decode(encode(0.89)) = 0.75.
- *
- *    Compression of norm values to a single byte saves memory at search time,
- *    because once a field is referenced at search time, its norms - for all
- *    documents - are maintained in memory.
- *
- *    The rationale supporting such lossy compression of norm values is that
- *    given the difficulty (and inaccuracy) of users to express their true
- *    information need by a query, only big differences matter.
- *
- *    Last, note that search time is too late to modify this norm part of
- *    scoring, e.g. by using a different {@link Similarity} for search.
- *
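For readability, the formulas reconstructed above can also be stated in LaTeX; this is purely a restatement of the removed javadoc's definitions, not new scoring behavior:

    \mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \mathrm{queryNorm}(q) \cdot
        \sum_{t \in q} \mathrm{tf}(t \in d) \cdot \mathrm{idf}(t)^2 \cdot
        t.\mathrm{getBoost}() \cdot \mathrm{norm}(t,d)

    \mathrm{queryNorm}(q) = \frac{1}{\sqrt{\mathrm{sumOfSquaredWeights}}}, \qquad
    \mathrm{sumOfSquaredWeights} = q.\mathrm{getBoost}()^2 \cdot
        \sum_{t \in q} \big( \mathrm{idf}(t) \cdot t.\mathrm{getBoost}() \big)^2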

+ *
+ * 1. The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a
+ *    single time, allowing the implementation to compute any statistics (such as IDF, average
+ *    document length, etc) across the entire collection. The {@link TermContext}s passed in are
+ *    already positioned at the terms involved, with the raw statistics available, so a Similarity
+ *    can freely use any combination of term statistics without causing any additional I/O.
+ *    Lucene makes no assumption about what is stored in the returned {@link Similarity.Stats} object.
+ * 2. The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
+ *    is called for each query leaf node, {@link SimilarityProvider#queryNorm(float)} is called for
+ *    the top-level query, and finally {@link Similarity.Stats#normalize(float, float)} passes down
+ *    the normalization value and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
+ * 3. For each segment in the index, the Query creates a
+ *    {@link #exactDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with exact
+ *    frequencies such as TermQuerys and exact PhraseQueries) or a
+ *    {@link #sloppyDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with sloppy
+ *    frequencies such as SpanQuerys and sloppy PhraseQueries). The score() method is called for
+ *    each matching document.
+ *

+ * + * When {@link IndexSearcher#explain(Query, int)} is called, queries consult the Similarity's DocScorer for an + * explanation of how it computed its score. The query passes in a the document id and an explanation of how the frequency + * was computed. * * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider) * @see IndexSearcher#setSimilarityProvider(SimilarityProvider) + * @lucene.experimental */ public abstract class Similarity { public static final int NO_DOC_ID_PROVIDED = -1; - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) - NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); - } - - /** Decodes a normalization factor stored in an index. - * @see #encodeNormValue(float) - */ - public float decodeNormValue(byte b) { - return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 - } - /** * Computes the normalization value for a field, given the accumulated * state of term processing for this field (see {@link FieldInvertState}). * - *

Implementations should calculate a float value based on the field + *

Implementations should calculate a byte value based on the field * state and then return that value. * *

Matches in longer fields are less precise, so implementations of this * method usually return smaller values when state.getLength() is large, * and larger values when state.getLength() is small. * - *

Note that the return values are computed under - * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} - * and then stored using - * {@link #encodeNormValue(float)}. - * Thus they have limited precision, and documents - * must be re-indexed if this method is altered. - * * @lucene.experimental * * @param state current processing state for this field - * @return the calculated float norm + * @return the calculated byte norm */ - public abstract float computeNorm(FieldInvertState state); - - /** Encodes a normalization factor for storage in an index. - * - *

The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - *

The default implementation calls {@link #tf(float)}. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public float tf(int freq) { - return tf((float)freq); - } + public abstract byte computeNorm(FieldInvertState state); /** Computes the amount of a sloppy phrase match, based on an edit distance. * This value is summed for each sloppy phrase match in a document to form - * the frequency that is passed to {@link #tf(float)}. + * the frequency to be used in scoring instead of the exact term count. * *

A phrase match with a small edit distance to a document passage more * closely matches the document, so implementations of this method usually @@ -619,124 +136,6 @@ public abstract class Similarity { */ public abstract float sloppyFreq(int distance); - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public abstract float tf(float freq); - - /** - * Computes a score factor for a simple term and returns an explanation - * for that score factor. - * - *

- * The default implementation uses: - * - *

-   * idf(docFreq, searcher.maxDoc());
-   * 
- * - * Note that {@link IndexSearcher#maxDoc()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link IndexSearcher#docFreq(Term)} is used, and when the latter - * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. - * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute - * - * @param term the term in question - * @param searcher the document collection being searched - * @param docFreq externally computed docFreq for this term - * @return an IDFExplain object that includes both an idf score factor - and an explanation for the term. - * @throws IOException - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException { - final int df = docFreq; - final int max = searcher.maxDoc(); - final float idf = idf(df, max); - return new IDFExplanation() { - @Override - public String explain() { - return "idf(docFreq=" + df + - ", maxDocs=" + max + ")"; - } - @Override - public float getIdf() { - return idf; - }}; - } - - /** - * This method forwards to {@link - * #idfExplain(Term,IndexSearcher,int)} by passing - * searcher.docFreq(term) as the docFreq. - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException { - return idfExplain(term, searcher, searcher.docFreq(term)); - } - - /** - * Computes a score factor for a phrase. - * - *

- * The default implementation sums the idf factor for - * each term in the phrase. - * - * @param terms the terms in the phrase - * @param searcher the document collection being searched - * @return an IDFExplain object that includes both an idf - * score factor for the phrase and an explanation - * for each term. - * @throws IOException - */ - public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - final int max = searcher.maxDoc(); - float idf = 0.0f; - final StringBuilder exp = new StringBuilder(); - for (final Term term : terms ) { - final int df = searcher.docFreq(term); - idf += idf(df, max); - exp.append(" "); - exp.append(term.text()); - exp.append("="); - exp.append(df); - } - final float fIdf = idf; - return new IDFExplanation() { - @Override - public float getIdf() { - return fIdf; - } - @Override - public String explain() { - return exp.toString(); - } - }; - } - - /** Computes a score factor based on a term's document frequency (the number - * of documents which contain the term). This value is multiplied by the - * {@link #tf(int)} factor for each term in the query and these products are - * then summed to form the initial score for a document. - * - *

Terms that occur in fewer documents are better indicators of topic, so - * implementations of this method usually return larger values for rare terms, - * and smaller values for common terms. - * - * @param docFreq the number of documents which contain the term - * @param numDocs the total number of documents in the collection - * @return a score factor based on the term's document frequency - */ - public abstract float idf(int docFreq, int numDocs); - /** * Calculate a scoring factor based on the data in the payload. Overriding implementations * are responsible for interpreting what is in the payload. Lucene makes no assumptions about @@ -758,5 +157,101 @@ public abstract class Similarity { { return 1; } - + + /** + * Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query. + */ + public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException; + + /** + * returns a new {@link Similarity.ExactDocScorer}. + */ + public abstract ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException; + + /** + * returns a new {@link Similarity.SloppyDocScorer}. + */ + public abstract SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException; + + /** + * API for scoring exact queries such as {@link TermQuery} and + * exact {@link PhraseQuery}. + *

+ * Term frequencies are integers (the term or phrase's tf) + */ + public abstract class ExactDocScorer { + /** + * Score a single document + * @param doc document id + * @param freq term frequency + * @return document's score + */ + public abstract float score(int doc, int freq); + + /** + * Explain the score for a single document + * @param doc document id + * @param freq Explanation of how the term frequency was computed + * @return document's score + */ + public Explanation explain(int doc, Explanation freq) { + Explanation result = new Explanation(score(doc, (int)freq.getValue()), + "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:"); + result.addDetail(freq); + return result; + } + } + + /** + * API for scoring "sloppy" queries such as {@link SpanQuery} and + * sloppy {@link PhraseQuery}. + *

+ * Term frequencies are floating point values. + */ + public abstract class SloppyDocScorer { + /** + * Score a single document + * @param doc document id + * @param freq sloppy term frequency + * @return document's score + */ + public abstract float score(int doc, float freq); + + /** + * Explain the score for a single document + * @param doc document id + * @param freq Explanation of how the sloppy term frequency was computed + * @return document's score + */ + public Explanation explain(int doc, Explanation freq) { + Explanation result = new Explanation(score(doc, freq.getValue()), + "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:"); + result.addDetail(freq); + return result; + } + } + + /** Stores the statistics for the indexed collection. This abstract + * implementation is empty; descendants of {@code Similarity} should + * subclass {@code Stats} and define the statistics they require in the + * subclass. Examples include idf, average field length, etc. + */ + public static abstract class Stats { + + /** The value for normalization of contained query clauses (e.g. sum of squared weights). + *

+ * NOTE: a Similarity implementation might not use any query normalization at all;
+ * it's not required. However, if it wants to participate in query normalization,
+ * it can return a value here.
+ */
+ public abstract float getValueForNormalization();
+
+ /** Assigns the query normalization factor and boost from parent queries to this.
+ *

+ * NOTE: a Similarity implementation might not use this normalized value at all, + * its not required. However, its usually a good idea to at least incorporate + * the topLevelBoost (e.g. from an outer BooleanQuery) into its score. + */ + public abstract void normalize(float queryNorm, float topLevelBoost); + } } diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 381518bbe10..5252c5550b4 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -25,11 +25,13 @@ final class SloppyPhraseScorer extends PhraseScorer { private PhrasePositions repeats[]; private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; - + private final Similarity similarity; + SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, - int slop, byte[] norms) { - super(weight, postings, similarity, norms); + int slop, Similarity.SloppyDocScorer docScorer) throws IOException { + super(weight, postings, docScorer); this.slop = slop; + this.similarity = similarity; } /** diff --git a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java b/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java new file mode 100644 index 00000000000..abc8e512064 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java @@ -0,0 +1,831 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.TermContext; +import org.apache.lucene.util.SmallFloat; + + +/** + * Implementation of {@link Similarity} with the Vector Space Model. + *

+ * Expert: Scoring API. + *

TFIDFSimilarity defines the components of Lucene scoring. + * Overriding computation of these components is a convenient + * way to alter Lucene scoring. + * + *
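+ *
+ * For example, a minimal subclass only has to supply the primitive components;
+ * everything else is inherited. A sketch (the concrete choices below are
+ * illustrative, not defaults, except where noted):
+ *
+ *   public class MySimilarity extends TFIDFSimilarity {
+ *     public float tf(float freq)                { return (float) Math.sqrt(freq); } // as in DefaultSimilarity
+ *     public float idf(int docFreq, int numDocs) { return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0); }
+ *     public float sloppyFreq(int distance)      { return 1.0f / (distance + 1); }
+ *     public byte computeNorm(FieldInvertState state) {
+ *       return encodeNormValue(state.getBoost()); // hypothetical: boost only, no length normalization
+ *     }
+ *   }
+ *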

Suggested reading: + * + * Introduction To Information Retrieval, Chapter 6. + * + *

The following describes how Lucene scoring evolves from
+ * underlying information retrieval models to (efficient) implementation.
+ * We first give a brief overview of the VSM Score,
+ * then derive from it Lucene's Conceptual Scoring Formula,
+ * from which, finally, evolves Lucene's Practical Scoring Function
+ * (the latter is connected directly with Lucene classes and methods).
+ *

Lucene combines + * + * Boolean model (BM) of Information Retrieval + * with + * + * Vector Space Model (VSM) of Information Retrieval - + * documents "approved" by BM are scored by VSM. + * + *

In VSM, documents and queries are represented as + * weighted vectors in a multi-dimensional space, + * where each distinct index term is a dimension, + * and weights are + * Tf-idf values. + * + *

VSM does not require weights to be Tf-idf values,
+ * but Tf-idf values are believed to produce search results of high quality,
+ * and so Lucene uses Tf-idf.
+ * Tf and Idf are described in more detail below,
+ * but for now, for completeness, let's just say that
+ * for a given term t and document (or query) x,
+ * Tf(t,x) varies with the number of occurrences of term t in x
+ * (when one increases so does the other) and
+ * idf(t) similarly varies with the inverse of the
+ * number of index documents containing term t.
+ *

VSM score of document d for query q is the + * + * Cosine Similarity + * of the weighted query vectors V(q) and V(d): + * + *
 
+ *
+ *     cosine-similarity(q,d)  =  ( V(q) · V(d) )  /  ( |V(q)| |V(d)| )
+ *
+ *                         VSM Score
+ *
+ * + * + * Where V(q) · V(d) is the + * dot product + * of the weighted vectors, + * and |V(q)| and |V(d)| are their + * Euclidean norms. + * + *

Note: the above equation can be viewed as the dot product of + * the normalized weighted vectors, in the sense that dividing + * V(q) by its euclidean norm is normalizing it to a unit vector. + * + *
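+ *
+ * For illustration only, a cosine similarity over two sparse term-weight
+ * vectors could be computed as follows (a hypothetical helper, not part of
+ * any Lucene API; it assumes the tf-idf weights were already computed):
+ *
+ *   static float cosineSimilarity(Map<String, Float> q, Map<String, Float> d) {
+ *     float dot = 0f, qNorm = 0f, dNorm = 0f;
+ *     for (Map.Entry<String, Float> e : q.entrySet()) {
+ *       Float w = d.get(e.getKey());
+ *       if (w != null) dot += e.getValue() * w;        // V(q) · V(d)
+ *       qNorm += e.getValue() * e.getValue();          // |V(q)|²
+ *     }
+ *     for (float w : d.values()) dNorm += w * w;       // |V(d)|²
+ *     return dot / (float) Math.sqrt(qNorm * dNorm);   // normalize by |V(q)| |V(d)|
+ *   }
+ *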

Lucene refines VSM score for both search quality and usability: + *

    + *
+ *   • Normalizing V(d) to the unit vector is known to be problematic in
+ *     that it removes all document length information. For some documents
+ *     removing this info is probably ok, e.g. a document made by duplicating
+ *     a certain paragraph 10 times, especially if that paragraph is made of
+ *     distinct terms. But for a document which contains no duplicated
+ *     paragraphs, this might be wrong. To avoid this problem, a different
+ *     document length normalization factor is used, which normalizes to a
+ *     vector equal to or larger than the unit vector: doc-len-norm(d).
+ *
+ *   • At indexing, users can specify that certain documents are more
+ *     important than others, by assigning a document boost. For this, the
+ *     score of each document is also multiplied by its boost value
+ *     doc-boost(d).
+ *
+ *   • Lucene is field based, hence each query term applies to a single
+ *     field, document length normalization is by the length of that field,
+ *     and in addition to document boost there are also document field boosts.
+ *
+ *   • The same field can be added to a document during indexing several
+ *     times, and so the boost of that field is the multiplication of the
+ *     boosts of the separate additions (or parts) of that field within the
+ *     document.
+ *
+ *   • At search time users can specify boosts to each query, sub-query, and
+ *     each query term, hence the contribution of a query term to the score
+ *     of a document is multiplied by the boost of that query term
+ *     query-boost(q).
+ *
+ *   • A document may match a multi term query without containing all the
+ *     terms of that query (this is correct for some of the queries), and
+ *     users can further reward documents matching more query terms through
+ *     a coordination factor, which is usually larger when more terms are
+ *     matched: coord-factor(q,d).
+ *

Under the simplifying assumption of a single field in the index, + * we get Lucene's Conceptual scoring formula: + * + *
 
+ *
+ *   score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)
+ *
+ *                    Lucene Conceptual Scoring Formula
+ *

The conceptual formula is a simplification in the sense that (1) terms and documents + * are fielded and (2) boosts are usually per query term rather than per query. + * + *

We now describe how Lucene implements this conceptual scoring formula, and + * derive from it Lucene's Practical Scoring Function. + * + *

For efficient score computation some scoring components
+ * are computed and aggregated in advance (a code sketch of this setup
+ * follows the list):
+ *

    + *
+ *   • Query-boost for the query (actually for each query term) is known
+ *     when search starts.
+ *
+ *   • The query Euclidean norm |V(q)| can be computed when search starts,
+ *     as it is independent of the document being scored.
+ *     From a search optimization perspective, it is a valid question why
+ *     bother to normalize the query at all, because all scored documents
+ *     will be multiplied by the same |V(q)|, and hence document ranks
+ *     (their order by score) will not be affected by this normalization.
+ *     There are two good reasons to keep this normalization:
+ *
+ *     • Recall that Cosine Similarity can be used to find how similar two
+ *       documents are. One can use Lucene for e.g. clustering, and use a
+ *       document as a query to compute its similarity to other documents.
+ *       In this use case it is important that the score of document d3 for
+ *       query d1 is comparable to the score of document d3 for query d2.
+ *       In other words, scores of a document for two distinct queries should
+ *       be comparable. There are other applications that may require this,
+ *       and this is exactly what normalizing the query vector V(q) provides:
+ *       comparability (to a certain extent) of two or more queries.
+ *
+ *     • Applying query normalization on the scores helps to keep the scores
+ *       around the unit vector, hence preventing loss of score data because
+ *       of floating point precision limitations.
+ *
+ *   • Document length norm doc-len-norm(d) and document boost doc-boost(d)
+ *     are known at indexing time. They are computed in advance and their
+ *     multiplication is saved as a single value in the index: norm(d).
+ *     (In the equations below, norm(t in d) means norm(field(t) in doc d)
+ *     where field(t) is the field associated with term t.)
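+ *
+ * In code, this up-front aggregation is the two-phase Weight setup the
+ * searcher performs before scoring any document (a sketch of the sequence,
+ * not the exact implementation):
+ *
+ *   Weight w = query.createWeight(searcher);   // collection-level stats gathered here
+ *   float sum = w.getValueForNormalization();  // e.g. sum of squared weights
+ *   float norm = searcher.getSimilarityProvider().queryNorm(sum);
+ *   w.normalize(norm, 1.0f);                   // topLevelBoost is 1 at the root
+ *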
+ * + *

Lucene's Practical Scoring Function is derived from the above
+ * conceptual formula as follows:
+ *
+ *   score(q,d)  =  coord(q,d) · queryNorm(q) · Σ { t in q } ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
+ *
+ *                    Lucene Practical Scoring Function
+ *

where + *

    + *
+ *   1. tf(t in d) correlates to the term's frequency, defined as the number
+ *      of times term t appears in the currently scored document d.
+ *      Documents that have more occurrences of a given term receive a
+ *      higher score. Note that tf(t in q) is assumed to be 1 and therefore
+ *      it does not appear in this equation. However, if a query contains
+ *      the same term twice, there will be two term-queries with that same
+ *      term, and hence the computation would still be correct (although
+ *      not very efficient). The default computation for tf(t in d) in
+ *      {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
+ *
+ *        {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency½
+ *
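+ *
+ *      Overriding this in a {@link TFIDFSimilarity} subclass is a one-liner;
+ *      a sketch of the square-root dampening named above:
+ *
+ *        public float tf(float freq) {
+ *          return (float) Math.sqrt(freq);  // dampens the effect of high within-document counts
+ *        }
+ *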
+ *   2. idf(t) stands for Inverse Document Frequency. This value correlates
+ *      to the inverse of docFreq (the number of documents in which the term
+ *      t appears). This means rarer terms give a higher contribution to the
+ *      total score. idf(t) appears for t in both the query and the document,
+ *      hence it is squared in the equation. The default computation for
+ *      idf(t) in
+ *      {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
+ *
+ *        {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log( numDocs / (docFreq + 1) )
+ *
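+ *
+ *      As a sketch, that default corresponds to:
+ *
+ *        public float idf(int docFreq, int numDocs) {
+ *          return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
+ *        }
+ *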
+ *   3. coord(q,d) is a score factor based on how many of the query terms
+ *      are found in the specified document. Typically, a document that
+ *      contains more of the query's terms will receive a higher score than
+ *      another document with fewer query terms. This is a search time
+ *      factor computed in {@link SimilarityProvider#coord(int, int) coord(q,d)}
+ *      by the SimilarityProvider in effect at search time.
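+ *
+ *      For instance, {@link org.apache.lucene.search.DefaultSimilarityProvider}
+ *      computes it as the fraction of query terms matched (a sketch):
+ *
+ *        public float coord(int overlap, int maxOverlap) {
+ *          return overlap / (float) maxOverlap;
+ *        }
+ *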
     
    + *
+ *   4. queryNorm(q) is a normalizing factor used to make scores between
+ *      queries comparable. This factor does not affect document ranking
+ *      (since all ranked documents are multiplied by the same factor), but
+ *      rather just attempts to make scores from different queries (or even
+ *      different indexes) comparable. This is a search time factor computed
+ *      by the SimilarityProvider in effect at search time. The default
+ *      computation in
+ *      {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
+ *      produces a Euclidean norm:
+ *
+ *        queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights½
+ *
+ *      The sum of squared weights (of the query terms) is computed by the
+ *      query {@link org.apache.lucene.search.Weight} object. For example, a
+ *      {@link org.apache.lucene.search.BooleanQuery} computes this value as:
+ *
+ *        {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights}  =  {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}² · Σ { t in q } ( idf(t) · t.getBoost() )²
+ *
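+ *
+ *      A sketch of the default queryNorm that inverts this value:
+ *
+ *        public float queryNorm(float sumOfSquaredWeights) {
+ *          return (float) (1.0 / Math.sqrt(sumOfSquaredWeights));
+ *        }
+ *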
+ *   5. t.getBoost() is a search time boost of term t in the query q as
+ *      specified in the query text (see query syntax), or as set by
+ *      application calls to
+ *      {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
+ *      Notice that there is really no direct API for accessing the boost of
+ *      one term in a multi-term query; rather, such queries are represented
+ *      as multiple {@link org.apache.lucene.search.TermQuery TermQuery}
+ *      objects, and so the boost of a term in the query is accessible by
+ *      calling the sub-query's
+ *      {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
     
    + *
+ *   6. norm(t,d) encapsulates a few (indexing time) boost and length factors:
+ *
+ *      • Document boost - set by calling
+ *        {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
+ *        before adding the document to the index.
+ *
+ *      • Field boost - set by calling
+ *        {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
+ *        before adding the field to a document.
+ *
+ *      • lengthNorm - computed when the document is added to the index in
+ *        accordance with the number of tokens of this field in the document,
+ *        so that shorter fields contribute more to the score. LengthNorm is
+ *        computed by the Similarity class in effect at indexing.
+ *
+ *      The {@link #computeNorm} method is responsible for combining all of
+ *      these factors into a single float.
+ *
+ *      When a document is added to the index, all the above factors are
+ *      multiplied. If the document has multiple fields with the same name,
+ *      all their boosts are multiplied together:
+ *
+ *        norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} · lengthNorm · Π { field f in d named as t } {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
+ *
+ *      However the resulting norm value is {@link #encodeNormValue(float) encoded}
+ *      as a single byte before being stored. At search time, the norm byte
+ *      value is read from the index
+ *      {@link org.apache.lucene.store.Directory directory} and
+ *      {@link #decodeNormValue(byte) decoded} back to a float norm value.
+ *      This encoding/decoding, while reducing index size, comes with the
+ *      price of precision loss - it is not guaranteed that
+ *      decode(encode(x)) = x. For instance, decode(encode(0.89)) = 0.75.
+ *
+ *      Compression of norm values to a single byte saves memory at search
+ *      time, because once a field is referenced at search time, its norms -
+ *      for all documents - are maintained in memory.
+ *
+ *      The rationale supporting such lossy compression of norm values is
+ *      that, given the difficulty (and inaccuracy) of users expressing their
+ *      true information need in a query, only big differences matter.
+ *
+ *      Last, note that search time is too late to modify this norm part of
+ *      scoring, e.g. by using a different {@link Similarity} for search.
+ *
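+ *
+ *      The lossy round trip can be checked directly with the values quoted
+ *      above (encodeNormValue/decodeNormValue are methods of this class):
+ *
+ *        byte b  = encodeNormValue(0.89f);  // SmallFloat 3-bit-mantissa encoding
+ *        float f = decodeNormValue(b);      // 0.75f, not the original 0.89f
+ *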

+ *
+ * + * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider) + * @see IndexSearcher#setSimilarityProvider(SimilarityProvider) + */ +public abstract class TFIDFSimilarity extends Similarity { + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + *

The default implementation calls {@link #tf(float)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public float tf(int freq) { + return tf((float)freq); + } + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public abstract float tf(float freq); + + /** + * Computes a score factor for a simple term and returns an explanation + * for that score factor. + * + *

+ * The default implementation uses: + * + *

+   * idf(docFreq, searcher.maxDoc());
+   * 
+ *
+   * Note that {@link IndexSearcher#maxDoc()} is used instead of
+   * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
+   * {@link IndexSearcher#docFreq(Term)} is also used; when the latter
+   * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
+   * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
+   *
+   * @param stats statistics of the term in question
+   * @param searcher the document collection being searched
+   * @return an Explanation object that includes both an idf score factor
+             and an explanation for the term.
+   * @throws IOException
+   */
+  public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
+    final int df = stats.docFreq();
+    final int max = searcher.maxDoc();
+    final float idf = idf(df, max);
+    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+  }
+
+  /**
+   * Computes a score factor for a phrase.
+   *
+   *

+ * The default implementation sums the idf factor for + * each term in the phrase. + * + * @param stats statistics of the terms in the phrase + * @param searcher the document collection being searched + * @return an Explain object that includes both an idf + * score factor for the phrase and an explanation + * for each term. + * @throws IOException + */ + public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException { + final int max = searcher.maxDoc(); + float idf = 0.0f; + final Explanation exp = new Explanation(); + exp.setDescription("idf(), sum of:"); + for (final TermContext stat : stats ) { + final int df = stat.docFreq(); + final float termIdf = idf(df, max); + exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); + idf += termIdf; + } + exp.setValue(idf); + return exp; + } + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + *

Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + public abstract float idf(int docFreq, int numDocs); + + /** Cache of decoded bytes. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); + } + + /** Decodes a normalization factor stored in an index. + * @see #encodeNormValue(float) + */ + public float decodeNormValue(byte b) { + return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + } + + /** Encodes a normalization factor for storage in an index. + * + *

The encoding uses a three-bit mantissa, a five-bit exponent, and + * the zero-exponent point at 15, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * @see org.apache.lucene.document.Field#setBoost(float) + * @see org.apache.lucene.util.SmallFloat + */ + public byte encodeNormValue(float f) { + return SmallFloat.floatToByte315(f); + } + + @Override + public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, + TermContext... termContexts) throws IOException { + final Explanation idf = termContexts.length == 1 + ? idfExplain(termContexts[0], searcher) + : idfExplain(termContexts, searcher); + return new IDFStats(idf, queryBoost); + } + + @Override + public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + return new ExactTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName)); + } + + @Override + public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + return new SloppyTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName)); + } + + // TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot. + + private final class ExactTFIDFDocScorer extends ExactDocScorer { + private final IDFStats stats; + private final float weightValue; + private final byte[] norms; + private static final int SCORE_CACHE_SIZE = 32; + private float[] scoreCache = new float[SCORE_CACHE_SIZE]; + + ExactTFIDFDocScorer(IDFStats stats, byte norms[]) { + this.stats = stats; + this.weightValue = stats.value; + this.norms = norms; + for (int i = 0; i < SCORE_CACHE_SIZE; i++) + scoreCache[i] = tf(i) * weightValue; + } + + @Override + public float score(int doc, int freq) { + final float raw = // compute tf(f)*weight + freq < SCORE_CACHE_SIZE // check cache + ? scoreCache[freq] // cache hit + : tf(freq)*weightValue; // cache miss + + return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + } + + private final class SloppyTFIDFDocScorer extends SloppyDocScorer { + private final IDFStats stats; + private final float weightValue; + private final byte[] norms; + + SloppyTFIDFDocScorer(IDFStats stats, byte norms[]) { + this.stats = stats; + this.weightValue = stats.value; + this.norms = norms; + } + + @Override + public float score(int doc, float freq) { + final float raw = tf(freq) * weightValue; // compute tf(f)*weight + + return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + } + + /** Collection statistics for the TF-IDF model. The only statistic of interest + * to this model is idf. 
*/ + private static class IDFStats extends Stats { + /** The idf and its explanation */ + private final Explanation idf; + private float queryNorm; + private float queryWeight; + private final float queryBoost; + private float value; + + public IDFStats(Explanation idf, float queryBoost) { + // TODO: Validate? + this.idf = idf; + this.queryBoost = queryBoost; + this.queryWeight = idf.getValue() * queryBoost; // compute query weight + } + + @Override + public float getValueForNormalization() { + // TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses? + return queryWeight * queryWeight; // sum of squared weights + } + + @Override + public void normalize(float queryNorm, float topLevelBoost) { + this.queryNorm = queryNorm * topLevelBoost; + queryWeight *= this.queryNorm; // normalize query weight + value = queryWeight * idf.getValue(); // idf for document + } + } + + private Explanation explainScore(int doc, Explanation freq, IDFStats stats, byte[] norms) { + Explanation result = new Explanation(); + result.setDescription("score(doc="+doc+",freq="+freq+"), product of:"); + + // explain query weight + Explanation queryExpl = new Explanation(); + queryExpl.setDescription("queryWeight, product of:"); + + Explanation boostExpl = new Explanation(stats.queryBoost, "boost"); + if (stats.queryBoost != 1.0f) + queryExpl.addDetail(boostExpl); + queryExpl.addDetail(stats.idf); + + Explanation queryNormExpl = new Explanation(stats.queryNorm,"queryNorm"); + queryExpl.addDetail(queryNormExpl); + + queryExpl.setValue(boostExpl.getValue() * + stats.idf.getValue() * + queryNormExpl.getValue()); + + result.addDetail(queryExpl); + + // explain field weight + Explanation fieldExpl = new Explanation(); + fieldExpl.setDescription("fieldWeight in "+doc+ + ", product of:"); + + Explanation tfExplanation = new Explanation(); + tfExplanation.setValue(tf(freq.getValue())); + tfExplanation.setDescription("tf(freq="+freq.getValue()+"), with freq of:"); + tfExplanation.addDetail(freq); + fieldExpl.addDetail(tfExplanation); + fieldExpl.addDetail(stats.idf); + + Explanation fieldNormExpl = new Explanation(); + float fieldNorm = + norms!=null ? 
decodeNormValue(norms[doc]) : 1.0f; + fieldNormExpl.setValue(fieldNorm); + fieldNormExpl.setDescription("fieldNorm(doc="+doc+")"); + fieldExpl.addDetail(fieldNormExpl); + + fieldExpl.setValue(tfExplanation.getValue() * + stats.idf.getValue() * + fieldNormExpl.getValue()); + + result.addDetail(fieldExpl); + + // combine them + result.setValue(queryExpl.getValue() * fieldExpl.getValue()); + + if (queryExpl.getValue() == 1.0f) + return fieldExpl; + + return result; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java index 501831728d3..192dd434be7 100644 --- a/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ReaderUtil; abstract class TermCollectingRewrite extends MultiTermQuery.RewriteMethod { @@ -43,7 +43,7 @@ abstract class TermCollectingRewrite extends MultiTermQuery.Rew addClause(topLevel, term, docCount, boost, null); } - protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException; + protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, TermContext states) throws IOException; protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index 078d02f7089..936b0bf4581 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -27,9 +27,9 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.search.Similarity.ExactDocScorer; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ToStringUtils; @@ -39,28 +39,19 @@ import org.apache.lucene.util.ToStringUtils; public class TermQuery extends Query { private final Term term; private int docFreq; - private transient PerReaderTermState perReaderTermState; + private transient TermContext perReaderTermState; private class TermWeight extends Weight { private final Similarity similarity; - private float value; - private final float idf; - private float queryNorm; - private float queryWeight; - private final IDFExplanation idfExp; - private transient PerReaderTermState termStates; + private final Similarity.Stats stats; + private transient TermContext termStates; - public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq) + public TermWeight(IndexSearcher searcher, TermContext termStates) throws IOException { - assert termStates != null : "PerReaderTermState must not be null"; + assert termStates != null : "TermContext must not be null"; 
this.termStates = termStates; this.similarity = searcher.getSimilarityProvider().get(term.field()); - if (docFreq != -1) { - idfExp = similarity.idfExplain(term, searcher, docFreq); - } else { - idfExp = similarity.idfExplain(term, searcher); - } - idf = idfExp.getIdf(); + this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates); } @Override @@ -70,19 +61,13 @@ public class TermQuery extends Query { public Query getQuery() { return TermQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -97,7 +82,7 @@ public class TermQuery extends Query { } final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state); assert docs != null; - return new TermScorer(this, docs, similarity, context.reader.norms(field)); + return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context)); } private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { @@ -107,79 +92,25 @@ public class TermQuery extends Query { } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - final IndexReader reader = context.reader; - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - Explanation expl = new Explanation(idf, idfExp.explain()); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(expl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - expl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - String field = term.field(); - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+term+" in "+doc+ - "), product of:"); - - Explanation tfExplanation = new Explanation(); - int tf = 0; + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + IndexReader reader = context.reader; DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes()); if (docs != null) { - int newDoc = docs.advance(doc); - if (newDoc == doc) { - tf = docs.freq(); - } - tfExplanation.setValue(similarity.tf(tf)); - tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); - } else { - tfExplanation.setValue(0.0f); - tfExplanation.setDescription("no matching term"); + int newDoc = docs.advance(doc); + if (newDoc == doc) { + int freq = docs.freq(); + ExactDocScorer docScorer = similarity.exactDocScorer(stats, term.field(), context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") 
[" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(expl); - - Explanation fieldNormExpl = new Explanation(); - final byte[] fieldNorms = reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch())); - fieldExpl.setValue(tfExplanation.getValue() * - expl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + return new ComplexExplanation(false, 0.0f, "no matching term"); } } @@ -200,7 +131,7 @@ public class TermQuery extends Query { /** Expert: constructs a TermQuery that will use the * provided docFreq instead of looking up the docFreq * against the searcher. */ - public TermQuery(Term t, PerReaderTermState states) { + public TermQuery(Term t, TermContext states) { assert states != null; term = t; docFreq = states.docFreq(); @@ -213,20 +144,20 @@ public class TermQuery extends Query { @Override public Weight createWeight(IndexSearcher searcher) throws IOException { final ReaderContext context = searcher.getTopReaderContext(); - final int weightDocFreq; - final PerReaderTermState termState; + final TermContext termState; if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { // make TermQuery single-pass if we don't have a PRTS or if the context differs! - termState = PerReaderTermState.build(context, term, true); // cache term lookups! - // we must not ignore the given docFreq - if set use the given value - weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq; + termState = TermContext.build(context, term, true); // cache term lookups! } else { // PRTS was pre-build for this IS termState = this.perReaderTermState; - weightDocFreq = docFreq; } + + // we must not ignore the given docFreq - if set use the given value (lie) + if (docFreq != -1) + termState.setDocFreq(docFreq); - return new TermWeight(searcher, termState, weightDocFreq); + return new TermWeight(searcher, termState); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/TermScorer.java b/lucene/src/java/org/apache/lucene/search/TermScorer.java index 9a9ef5eeb3c..3534079fb34 100644 --- a/lucene/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/TermScorer.java @@ -25,20 +25,16 @@ import org.apache.lucene.index.DocsEnum; */ final class TermScorer extends Scorer { private DocsEnum docsEnum; - private byte[] norms; - private float weightValue; private int doc = -1; private int freq; private int pointer; private int pointerMax; - private static final int SCORE_CACHE_SIZE = 32; - private float[] scoreCache = new float[SCORE_CACHE_SIZE]; private int[] docs; private int[] freqs; private final DocsEnum.BulkReadResult bulkResult; - private final Similarity similarity; + private final Similarity.ExactDocScorer docScorer; /** * Construct a TermScorer. 
@@ -47,22 +43,15 @@ final class TermScorer extends Scorer { * The weight of the Term in the query. * @param td * An iterator over the documents matching the Term. - * @param similarity - * The Similarity implementation to be used for score - * computations. - * @param norms - * The field norms of the document fields for the Term. + * @param docScorer + * The Similarity.ExactDocScorer implementation + * to be used for score computations. */ - TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { + TermScorer(Weight weight, DocsEnum td, Similarity.ExactDocScorer docScorer) throws IOException { super(weight); - this.similarity = similarity; + this.docScorer = docScorer; this.docsEnum = td; - this.norms = norms; - this.weightValue = weight.getValue(); bulkResult = td.getBulkResult(); - - for (int i = 0; i < SCORE_CACHE_SIZE; i++) - scoreCache[i] = similarity.tf(i) * weightValue; } @Override @@ -134,12 +123,7 @@ final class TermScorer extends Scorer { @Override public float score() { assert doc != NO_MORE_DOCS; - float raw = // compute tf(f)*weight - freq < SCORE_CACHE_SIZE // check cache - ? scoreCache[freq] // cache hit - : similarity.tf(freq)*weightValue; // cache miss - - return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field + return docScorer.score(doc, freq); } /** diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java index 5b322a87910..4ad6222b801 100644 --- a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * Base rewrite method for collecting only the top terms @@ -80,7 +80,7 @@ public abstract class TopTermsRewrite extends TermCollectingRew this.termComp = termsEnum.getComparator(); // lazy init the initial ScoreTerm because comparator is not known on ctor: if (st == null) - st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext)); + st = new ScoreTerm(this.termComp, new TermContext(topReaderContext)); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); } @@ -101,14 +101,14 @@ public abstract class TopTermsRewrite extends TermCollectingRew if (t != null) { // if the term is already in the PQ, only update docFreq of term in PQ assert t.boost == boost : "boost should be equal in all segment TermsEnums"; - t.termState.register(state, readerContext.ord, termsEnum.docFreq()); + t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { // add new entry in PQ, we must clone the term, else it may get overwritten! st.bytes.copy(bytes); st.boost = boost; visitedTerms.put(st.bytes, st); assert st.termState.docFreq() == 0; - st.termState.register(state, readerContext.ord, termsEnum.docFreq()); + st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); stQueue.offer(st); // possibly drop entries from queue if (stQueue.size() > maxSize) { @@ -116,7 +116,7 @@ public abstract class TopTermsRewrite extends TermCollectingRew visitedTerms.remove(st.bytes); st.termState.clear(); // reset the termstate! 
} else { - st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext)); + st = new ScoreTerm(termComp, new TermContext(topReaderContext)); } assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; // set maxBoostAtt with values to help FuzzyTermsEnum to optimize @@ -171,8 +171,8 @@ public abstract class TopTermsRewrite extends TermCollectingRew public final Comparator termComp; public final BytesRef bytes = new BytesRef(); public float boost; - public final PerReaderTermState termState; - public ScoreTerm(Comparator termComp, PerReaderTermState termState) { + public final TermContext termState; + public ScoreTerm(Comparator termComp, TermContext termState) { this.termComp = termComp; this.termState = termState; } diff --git a/lucene/src/java/org/apache/lucene/search/Weight.java b/lucene/src/java/org/apache/lucene/search/Weight.java index 3fb892714c6..e99c5a6b5cb 100644 --- a/lucene/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/src/java/org/apache/lucene/search/Weight.java @@ -41,11 +41,11 @@ import org.apache.lucene.index.IndexReader.ReaderContext; *

    *
 *   1. A Weight is constructed by a top-level query, given a
 *      IndexSearcher ({@link Query#createWeight(IndexSearcher)}).
- *   2. The {@link #sumOfSquaredWeights()} method is called on the
+ *   2. The {@link #getValueForNormalization()} method is called on the
 *      Weight to compute the query normalization factor
 *      {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the
 *      query.
- *   3. The query normalization factor is passed to {@link #normalize(float)}. At
+ *   3. The query normalization factor is passed to {@link #normalize(float, float)}. At
 *      this point the weighting is complete.
  6. A Scorer is constructed by * {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}. @@ -67,12 +67,12 @@ public abstract class Weight { /** The query that this concerns. */ public abstract Query getQuery(); + + /** The value for normalization of contained query clauses (e.g. sum of squared weights). */ + public abstract float getValueForNormalization() throws IOException; - /** The weight for this query. */ - public abstract float getValue(); - - /** Assigns the query normalization factor to this. */ - public abstract void normalize(float norm); + /** Assigns the query normalization factor and boost from parent queries to this. */ + public abstract void normalize(float norm, float topLevelBoost); /** * Returns a {@link Scorer} which scores documents in/out-of order according @@ -93,9 +93,6 @@ public abstract class Weight { * @throws IOException */ public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException; - - /** The sum of squared weights of contained query clauses. */ - public abstract float sumOfSquaredWeights() throws IOException; /** * Returns true iff this implementation scores docs only out of order. This diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java index da91ef59f9d..ac2f5008cd3 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java @@ -18,11 +18,13 @@ package org.apache.lucene.search.payloads; */ import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.Similarity.SloppyDocScorer; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.SpanNearQuery; @@ -145,7 +147,35 @@ public class PayloadNearQuery extends SpanNearQuery { @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { return new PayloadNearSpanScorer(query.getSpans(context), this, - similarity, context.reader.norms(query.getField())); + similarity, similarity.sloppyDocScorer(stats, query.getField(), context)); + } + + @Override + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + Explanation expl = new Explanation(); + expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + expl.addDetail(scoreExplanation); + expl.setValue(scoreExplanation.getValue()); + // now the payloads part + Explanation payloadExpl = function.explain(doc, scorer.payloadsSeen, scorer.payloadScore); + // combined + ComplexExplanation result = new ComplexExplanation(); + result.addDetail(expl); + result.addDetail(payloadExpl); + 
result.setValue(expl.getValue() * payloadExpl.getValue()); + result.setDescription("PayloadNearQuery, product of:"); + return result; + } + } + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } @@ -155,8 +185,8 @@ public class PayloadNearQuery extends SpanNearQuery { private int payloadsSeen; protected PayloadNearSpanScorer(Spans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); + Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { + super(spans, weight, similarity, docScorer); this.spans = spans; } @@ -225,20 +255,6 @@ public class PayloadNearQuery extends SpanNearQuery { return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore); } - - @Override - protected Explanation explain(int doc) throws IOException { - Explanation result = new Explanation(); - // Add detail about tf/idf... - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - // Add detail about payload - Explanation payloadExpl = function.explain(doc, payloadsSeen, payloadScore); - result.addDetail(payloadExpl); - result.setValue(nonPayloadExpl.getValue() * payloadExpl.getValue()); - result.setDescription("PayloadNearQuery, product of:"); - return result; - } } } diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java index b3415a7b42c..9c697de1474 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java @@ -26,6 +26,9 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ComplexExplanation; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; @@ -76,7 +79,7 @@ public class PayloadTermQuery extends SpanTermQuery { @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { return new PayloadTermSpanScorer((TermSpans) query.getSpans(context), - this, similarity, context.reader.norms(query.getField())); + this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context)); } protected class PayloadTermSpanScorer extends SpanScorer { @@ -86,8 +89,8 @@ public class PayloadTermQuery extends SpanTermQuery { private final TermSpans termSpans; public PayloadTermSpanScorer(TermSpans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); + Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { + super(spans, weight, similarity, docScorer); termSpans = spans; } @@ -173,29 +176,40 @@ public class PayloadTermQuery extends SpanTermQuery { protected float getPayloadScore() { return function.docScore(doc, term.field(), payloadsSeen, payloadScore); } - - @Override - protected Explanation explain(final int doc) throws IOException { - ComplexExplanation result = new ComplexExplanation(); - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - // QUESTION: Is there a way to 
avoid this skipTo call? We need to know - // whether to load the payload or not - Explanation payloadBoost = new Explanation(); - result.addDetail(payloadBoost); - - float payloadScore = getPayloadScore(); - payloadBoost.setValue(payloadScore); - // GSI: I suppose we could toString the payload, but I don't think that - // would be a good idea - payloadBoost.setDescription("scorePayload(...)"); - result.setValue(nonPayloadExpl.getValue() * payloadScore); - result.setDescription("btq, product of:"); - result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE - : Boolean.TRUE); // LUCENE-1303 - return result; + } + + @Override + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + Explanation expl = new Explanation(); + expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + expl.addDetail(scoreExplanation); + expl.setValue(scoreExplanation.getValue()); + // now the payloads part + // QUESTION: Is there a way to avoid this skipTo call? We need to know + // whether to load the payload or not + // GSI: I suppose we could toString the payload, but I don't think that + // would be a good idea + Explanation payloadExpl = new Explanation(scorer.getPayloadScore(), "scorePayload(...)"); + payloadExpl.setValue(scorer.getPayloadScore()); + // combined + ComplexExplanation result = new ComplexExplanation(); + result.addDetail(expl); + result.addDetail(payloadExpl); + result.setValue(expl.getValue() * payloadExpl.getValue()); + result.setDescription("btq, product of:"); + result.setMatch(expl.getValue() == 0 ? 
Boolean.FALSE : Boolean.TRUE); // LUCENE-1303 + return result; + } } - + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java index 865e2b1eb46..a393b38b5df 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TopTermsRewrite; import org.apache.lucene.search.ScoringRewrite; import org.apache.lucene.search.BooleanClause.Occur; // javadocs only -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * Wraps any {@link MultiTermQuery} as a {@link SpanQuery}, @@ -155,7 +155,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { + protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) { final SpanTermQuery q = new SpanTermQuery(term); q.setBoost(boost); topLevel.addClause(q); @@ -204,7 +204,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { + protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, TermContext states) { final SpanTermQuery q = new SpanTermQuery(term); q.setBoost(boost); topLevel.addClause(q); diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java index 8b309a3df68..9cce1f45e4b 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.spans; import java.io.IOException; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Similarity; @@ -29,22 +30,21 @@ import org.apache.lucene.search.Similarity; */ public class SpanScorer extends Scorer { protected Spans spans; - protected byte[] norms; - protected float value; protected boolean more = true; protected int doc; protected float freq; protected final Similarity similarity; + protected final Similarity.SloppyDocScorer docScorer; - protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) + protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { super(weight); this.similarity = similarity; + this.docScorer = docScorer; this.spans = spans; - this.norms = norms; - this.value = weight.getValue(); + if (this.spans.next()) { doc = -1; } else { @@ -94,27 +94,11 @@ public class SpanScorer extends Scorer { @Override public float score() throws IOException { - float raw = similarity.tf(freq) * value; // raw score - return norms == null? 
raw : raw * similarity.decodeNormValue(norms[doc]); // normalize + return docScorer.score(doc, freq); } @Override public float freq() throws IOException { return freq; } - - /** This method is no longer an official member of {@link Scorer}, - * but it is needed by SpanWeight to build an explanation. */ - protected Explanation explain(final int doc) throws IOException { - Explanation tfExplanation = new Explanation(); - - int expDoc = advance(doc); - - float phraseFreq = (expDoc == doc) ? freq : 0.0f; - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - - return tfExplanation; - } - } diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java index 104bacf0a37..cf8bf4e22a2 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,125 +18,76 @@ package org.apache.lucene.search.spans; */ import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.util.TermContext; import java.io.IOException; -import java.util.HashSet; import java.util.Set; +import java.util.TreeSet; /** * Expert-only. Public for use by other weight implementations */ public class SpanWeight extends Weight { protected Similarity similarity; - protected float value; - protected float idf; - protected float queryNorm; - protected float queryWeight; - protected Set terms; protected SpanQuery query; - private IDFExplanation idfExp; + protected Similarity.Stats stats; public SpanWeight(SpanQuery query, IndexSearcher searcher) throws IOException { this.similarity = searcher.getSimilarityProvider().get(query.getField()); this.query = query; - terms=new HashSet(); + terms=new TreeSet(); query.extractTerms(terms); - - idfExp = similarity.idfExplain(terms, searcher); - idf = idfExp.getIdf(); + final ReaderContext context = searcher.getTopReaderContext(); + final TermContext states[] = new TermContext[terms.size()]; + int i = 0; + for (Term term : terms) + states[i++] = TermContext.build(context, term, true); + stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states); } @Override public Query getQuery() { return query; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() throws IOException { - queryWeight = idf * query.getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() throws IOException { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new SpanScorer(query.getSpans(context), this, similarity, context.reader - .norms(query.getField())); + return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), 
context)); } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - String field = ((SpanQuery)getQuery()).getField(); - - Explanation idfExpl = - new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost"); - if (getQuery().getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+ - " in "+doc+"), product of:"); - - Explanation tfExpl = ((SpanScorer)scorer(context, ScorerContext.def())).explain(doc); - fieldExpl.addDetail(tfExpl); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); - fieldExpl.setValue(tfExpl.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + Scorer scorer = scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } + } + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/util/PerReaderTermState.java b/lucene/src/java/org/apache/lucene/util/TermContext.java similarity index 73% rename from lucene/src/java/org/apache/lucene/util/PerReaderTermState.java rename to lucene/src/java/org/apache/lucene/util/TermContext.java index a5139b6335e..746405c353d 100644 --- a/lucene/src/java/org/apache/lucene/util/PerReaderTermState.java +++ b/lucene/src/java/org/apache/lucene/util/TermContext.java @@ -28,25 +28,27 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import 
org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.index.TermsEnum.SeekStatus; /** * Maintains a {@link IndexReader} {@link TermState} view over * {@link IndexReader} instances containing a single term. The - * {@link PerReaderTermState} doesn't track if the given {@link TermState} + * {@link TermContext} doesn't track if the given {@link TermState} * objects are valid, nor whether the {@link TermState} instances refer to the * same terms in the associated readers. * * @lucene.experimental */ -public final class PerReaderTermState { +public final class TermContext { public final ReaderContext topReaderContext; // for asserting! private final TermState[] states; private int docFreq; + private long totalTermFreq; /** - * Creates an empty {@link PerReaderTermState} from a {@link ReaderContext} + * Creates an empty {@link TermContext} from a {@link ReaderContext} */ - public PerReaderTermState(ReaderContext context) { + public TermContext(ReaderContext context) { assert context != null && context.isTopLevel; topReaderContext = context; docFreq = 0; @@ -60,28 +62,28 @@ public final class PerReaderTermState { } /** - * Creates a {@link PerReaderTermState} with an initial {@link TermState}, + * Creates a {@link TermContext} with an initial {@link TermState}, * {@link IndexReader} pair. */ - public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) { + public TermContext(ReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) { this(context); - register(state, ord, docFreq); + register(state, ord, docFreq, totalTermFreq); } /** - * Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the + * Creates a {@link TermContext} from a top-level {@link ReaderContext} and the * given {@link Term}. This method will look up the given term in all of the context's leaf readers - * and register each of the readers containing the term in the returned {@link PerReaderTermState} + * and register each of the readers containing the term in the returned {@link TermContext} * using the leaf reader's ordinal. *

    * Note: the given context must be a top-level context. */ - public static PerReaderTermState build(ReaderContext context, Term term, boolean cache) + public static TermContext build(ReaderContext context, Term term, boolean cache) throws IOException { assert context != null && context.isTopLevel; final String field = term.field(); final BytesRef bytes = term.bytes(); - final PerReaderTermState perReaderTermState = new PerReaderTermState(context); + final TermContext perReaderTermState = new TermContext(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); for (int i = 0; i < leaves.length; i++) { final Fields fields = leaves[i].reader.fields(); @@ -91,7 +93,7 @@ public final class PerReaderTermState { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! if (termsEnum.seekExact(bytes, cache)) { final TermState termState = termsEnum.termState(); - perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); + perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } @@ -100,7 +102,7 @@ public final class PerReaderTermState { } /** - * Clears the {@link PerReaderTermState} internal state and removes all + * Clears the {@link TermContext} internal state and removes all * registered {@link TermState}s */ public void clear() { @@ -112,12 +114,16 @@ public final class PerReaderTermState { * Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal * should be derived from a {@link ReaderContext}'s leaf ord. */ - public void register(TermState state, final int ord, final int docFreq) { + public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) { assert state != null : "state must not be null"; assert ord >= 0 && ord < states.length; assert states[ord] == null : "state for ord: " + ord + " already registered"; this.docFreq += docFreq; + if (this.totalTermFreq >= 0 && totalTermFreq >= 0) + this.totalTermFreq += totalTermFreq; + else + this.totalTermFreq = -1; states[ord] = state; } @@ -137,11 +143,27 @@ public final class PerReaderTermState { /** * Returns the accumulated document frequency of all {@link TermState} - * instances passed to {@link #register(TermState, int, int)}. + * instances passed to {@link #register(TermState, int, int, long)}. * @return the accumulated document frequency of all {@link TermState} - * instances passed to {@link #register(TermState, int, int)}. + * instances passed to {@link #register(TermState, int, int, long)}. */ public int docFreq() { return docFreq; } + + /** + * Returns the accumulated term frequency of all {@link TermState} + * instances passed to {@link #register(TermState, int, int, long)}. + * @return the accumulated term frequency of all {@link TermState} + * instances passed to {@link #register(TermState, int, int, long)}.
+ */ + public long totalTermFreq() { + return totalTermFreq; + } + + /** expert: only available for queries that want to lie about docfreq + * @lucene.internal */ + public void setDocFreq(int docFreq) { + this.docFreq = docFreq; + } } \ No newline at end of file diff --git a/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java b/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java index 41541264955..3573ffc6fb5 100644 --- a/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java +++ b/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java @@ -62,12 +62,7 @@ public class AssertingIndexSearcher extends IndexSearcher { } @Override - public float getValue() { - return w.getValue(); - } - - @Override - public void normalize(float norm) { + public void normalize(float norm, float topLevelBoost) { throw new IllegalStateException("Weight already normalized."); } @@ -77,7 +72,7 @@ public class AssertingIndexSearcher extends IndexSearcher { } @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { throw new IllegalStateException("Weight already normalized."); } diff --git a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java index 6f1d333cc3d..36362555e33 100644 --- a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java +++ b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java @@ -329,9 +329,10 @@ public class CheckHits { Explanation detail[] = expl.getDetails(); if (detail!=null) { if (detail.length==1) { - // simple containment, no matter what the description says, + // simple containment, unless it's a freq of: (which lets a query explain how the freq is calculated), // just verify contained expl has same score - verifyExplanation(q,doc,score,deep,detail[0]); + if (!expl.getDescription().endsWith("with freq of:")) + verifyExplanation(q,doc,score,deep,detail[0]); } else { // explanation must either: // - end with one of: "product of:", "sum of:", "max of:", or @@ -357,6 +358,7 @@ public class CheckHits { } } } + // TODO: this is a TERRIBLE assertion!!!!
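[Editor's note] The TermContext rename above is the statistics backbone of this patch: a Weight now gathers docFreq/totalTermFreq once at the top level, hands them to the Similarity to produce opaque Stats, and later binds a per-leaf document scorer. A minimal sketch of that lifecycle, assuming only the APIs visible in this diff (the class and method names StatsLifecycleSketch/run are illustrative, not part of the patch):

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader.AtomicReaderContext;
    import org.apache.lucene.index.IndexReader.ReaderContext;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.util.ReaderUtil;
    import org.apache.lucene.util.TermContext;

    class StatsLifecycleSketch {
      // Illustrative: the handshake a Weight performs under the decoupled API.
      void run(IndexSearcher searcher, Term term) throws IOException {
        Similarity sim = searcher.getSimilarityProvider().get(term.field());
        ReaderContext top = searcher.getTopReaderContext();
        // 1) collect docFreq/totalTermFreq across all leaf readers, once:
        TermContext states = TermContext.build(top, term, true);
        // 2) the Similarity folds raw statistics into opaque, normalizable Stats:
        Similarity.Stats stats = sim.computeStats(searcher, term.field(), 1.0f, states);
        // 3) two-phase normalization, replacing sumOfSquaredWeights()/normalize(norm):
        float queryNorm = searcher.getSimilarityProvider().queryNorm(stats.getValueForNormalization());
        stats.normalize(queryNorm, 1.0f); // (queryNorm, topLevelBoost)
        // 4) per segment, bind a scorer to the leaf's norms:
        for (AtomicReaderContext leaf : ReaderUtil.leaves(top)) {
          Similarity.SloppyDocScorer docScorer = sim.sloppyDocScorer(stats, term.field(), leaf);
          // docScorer.score(doc, freq) yields the per-document score contribution
        }
      }
    }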
Assert.assertTrue( q+": multi valued explanation description=\""+descr +"\" must be 'max of plus x times others' or end with 'product of'" diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index bdd53b3e9aa..d1dbefeee6a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -38,7 +38,6 @@ import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Similarity; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; @@ -375,7 +374,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", sim.encodeNormValue(2.0f)); reader.close(); searcher.close(); @@ -421,7 +421,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(22, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(22, "content", sim.encodeNormValue(2.0f)); reader.close(); // make sure they "took": @@ -483,7 +484,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); } @@ -526,7 +528,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java index f46fd29fbe8..9c4994ebb43 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -655,7 +656,8 @@ public class TestDeletionPolicy extends LuceneTestCase { 
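[Editor's note] From here on, many test hunks (TestBackwardsCompatibility above; TestDeletionPolicy, TestIndexReader, and others below) replace `Similarity sim` with `DefaultSimilarity sim` around setNorm calls. That is a direct consequence of the refactor: encodeNormValue/decodeNormValue now live on the TF/IDF side (TFIDFSimilarity, and thus DefaultSimilarity), not on the stripped-down Similarity base class. A minimal round-trip sketch, assuming DefaultSimilarity as used in these hunks (the 1.5f value is illustrative):

    import org.apache.lucene.search.DefaultSimilarity;

    class NormEncodingSketch {
      static float roundTrip() {
        DefaultSimilarity sim = new DefaultSimilarity();
        byte b = sim.encodeNormValue(1.5f); // norms are stored as a lossy single byte
        return sim.decodeNormValue(b);      // ~1.5f, up to 8-bit quantization error
      }
    }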
writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3*i+1); - reader.setNorm(4*i+1, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(4*i+1, "content", sim.encodeNormValue(2.0F)); IndexSearcher searcher = newSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16*(1+i), hits.length); @@ -781,7 +783,8 @@ public class TestDeletionPolicy extends LuceneTestCase { writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3); - reader.setNorm(5, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(5, "content", sim.encodeNormValue(2.0F)); IndexSearcher searcher = newSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16, hits.length); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java index 6ea632e1341..fd72f4510ca 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java @@ -71,7 +71,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { Term searchTerm = new Term("id", "7"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("didn't delete the right number of documents", 1, delCount); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); // Set one norm so we get a .s0 file: reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index 05b2f3c951b..7965406ff98 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -421,7 +421,7 @@ public class TestIndexReader extends LuceneTestCase // expected } - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm after close failed to throw IOException"); @@ -462,7 +462,7 @@ public class TestIndexReader extends LuceneTestCase // expected } - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm should have hit LockObtainFailedException"); @@ -494,7 +494,7 @@ public class TestIndexReader extends LuceneTestCase // now open reader & set norm for doc 0 IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); // we should be holding the write lock now: @@ -539,7 +539,7 @@ public class TestIndexReader extends LuceneTestCase addDoc(writer, searchTerm.text()); writer.close(); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); // now open reader & set norm for doc 0 (writes to // _0_1.s0) reader = IndexReader.open(dir, false); @@ -738,7 +738,7 @@ public class TestIndexReader extends LuceneTestCase } reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new 
DefaultSimilarity(); try { reader.setNorm(1, "content", sim.encodeNormValue(2.0f)); fail("did not hit exception when calling setNorm on an invalid doc number"); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java index bcbf857a195..9b6c4d24fd2 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java @@ -273,7 +273,7 @@ public class TestIndexReaderClone extends LuceneTestCase { * @throws Exception */ private void performDefaultTests(IndexReader r1) throws Exception { - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]); IndexReader pr1Clone = (IndexReader) r1.clone(); @@ -329,7 +329,7 @@ public class TestIndexReaderClone extends LuceneTestCase { TestIndexReaderReopen.createIndex(random, dir1, false); SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false)); origSegmentReader.deleteDocument(1); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f)); SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader @@ -429,7 +429,7 @@ public class TestIndexReaderClone extends LuceneTestCase { final Directory dir1 = newDirectory(); TestIndexReaderReopen.createIndex(random, dir1, false); IndexReader orig = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); orig.setNorm(1, "field1", sim.encodeNormValue(17.0f)); final byte encoded = sim.encodeNormValue(17.0f); assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java index 32cef3ea5db..228d03331da 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java @@ -47,9 +47,9 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } }; } @@ -217,7 +217,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { IndexReader reader4C = (IndexReader) reader3C.clone(); SegmentReader segmentReader4C = getOnlySegmentReader(reader4C); assertEquals(4, reader3CCNorm.bytesRef().get()); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f)); // generate a cannot update exception in reader1 @@ -278,7 +278,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { // System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm)); ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm)); // System.out.println("setNorm i: "+i); @@ -300,7 +300,7 @@ public class
TestIndexReaderCloneNorms extends LuceneTestCase { assertEquals("number of norms mismatches", numDocNorms, b.length); ArrayList storedNorms = (i == 1 ? modifiedNorms : norms); for (int j = 0; j < b.length; j++) { - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); float norm = sim.decodeNormValue(b[j]); float norm1 = storedNorms.get(j).floatValue(); assertEquals("stored norm value of " + field + " for doc " + j + " is " @@ -340,7 +340,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { // return unique norm values that are unchanged by encoding/decoding private float nextNorm(String fname) { float norm = lastNorm + normDelta; - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); do { float norm1 = sim.decodeNormValue( sim.encodeNormValue(norm)); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java index d17457fe00a..791a14fdb79 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java @@ -131,7 +131,7 @@ public class TestIndexReaderOnDiskFull extends LuceneTestCase { dir.setMaxSizeInBytes(thisDiskFree); dir.setRandomIOExceptionRate(rate); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { if (0 == x) { int docId = 12; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java index f9277eab9a9..3200204df5f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java @@ -606,7 +606,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { IndexReader reader2 = reader1.reopen(); modifier = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(1, "field1", sim.encodeNormValue(50f)); modifier.setNorm(1, "field2", sim.encodeNormValue(50f)); modifier.close(); @@ -702,7 +702,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { protected void modifyIndex(int i) throws IOException { if (i % 3 == 0) { IndexReader modifier = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(i, "field1", sim.encodeNormValue(50f)); modifier.close(); } else if (i % 3 == 1) { @@ -983,7 +983,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { } case 1: { IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(4, "field1", sim.encodeNormValue(123f)); reader.setNorm(44, "field2", sim.encodeNormValue(222f)); reader.setNorm(44, "field4", sim.encodeNormValue(22f)); @@ -1007,7 +1007,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { } case 4: { IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(5, "field1", sim.encodeNormValue(123f)); reader.setNorm(55, "field2", sim.encodeNormValue(222f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java index 
d81d3a404be..9744008ece6 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java +++ b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java @@ -116,8 +116,8 @@ public class TestMaxTermFrequency extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { - return (float) state.getMaxTermFrequency(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue((float) state.getMaxTermFrequency()); } } } diff --git a/lucene/src/test/org/apache/lucene/index/TestNorms.java b/lucene/src/test/org/apache/lucene/index/TestNorms.java index 3a8b295f287..372ae2ef964 100755 --- a/lucene/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestNorms.java @@ -46,9 +46,9 @@ public class TestNorms extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } }; } @@ -177,7 +177,7 @@ public class TestNorms extends LuceneTestCase { //System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm)); ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm)); } @@ -192,8 +192,9 @@ public class TestNorms extends LuceneTestCase { byte b[] = MultiNorms.norms(ir, field); assertEquals("number of norms mismatches",numDocNorms,b.length); ArrayList storedNorms = (i==1 ? modifiedNorms : norms); + DefaultSimilarity sim = (DefaultSimilarity) similarityProviderOne.get(field); for (int j = 0; j < b.length; j++) { - float norm = similarityProviderOne.get(field).decodeNormValue(b[j]); + float norm = sim.decodeNormValue(b[j]); float norm1 = storedNorms.get(j).floatValue(); assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); } @@ -229,7 +230,7 @@ public class TestNorms extends LuceneTestCase { // return unique norm values that are unchanged by encoding/decoding private float nextNorm(String fname) { float norm = lastNorm + normDelta; - Similarity similarity = similarityProviderOne.get(fname); + DefaultSimilarity similarity = (DefaultSimilarity) similarityProviderOne.get(fname); do { float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm)); if (norm1 > lastNorm) { @@ -259,8 +260,8 @@ public class TestNorms extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { - return (float) state.getLength(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue((float) state.getLength()); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java index cf7ecbd16ef..efef48a9729 100644 --- a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java +++ b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java @@ -18,9 +18,9 @@ package org.apache.lucene.index; */ import java.io.IOException; -import java.util.Collection; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -30,7
+30,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; -import org.apache.lucene.search.Explanation.IDFExplanation; public class TestOmitTf extends LuceneTestCase { @@ -39,23 +38,14 @@ public class TestOmitTf extends LuceneTestCase { public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; } public Similarity get(String field) { - return new Similarity() { + return new TFIDFSimilarity() { - @Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } + @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; } - @Override public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - @Override - public String explain() { - return "Inexplicable"; - } - }; + @Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java index 6b5dc4eea04..b2d0b3cd26a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java @@ -149,7 +149,7 @@ public class TestParallelReader extends LuceneTestCase { assertTrue(pr.isCurrent()); IndexReader modifier = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(0, "f1", sim.encodeNormValue(100f)); modifier.close(); diff --git a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java index 13e7f8145c3..167d10e696c 100644 --- a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java +++ b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java @@ -20,7 +20,11 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Similarity.ExactDocScorer; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.Similarity.Stats; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.PriorityQueue; @@ -187,8 +191,8 @@ final class JustCompileSearch { static final class JustCompilePhraseScorer extends PhraseScorer { JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { - super(weight, postings, similarity, norms); + Similarity.SloppyDocScorer docScorer) throws IOException { + super(weight, postings, docScorer); } @Override @@ -243,12 +247,22 @@ final class JustCompileSearch { static final class JustCompileSimilarity extends Similarity { @Override - public float idf(int docFreq, int numDocs) { + public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, 
TermContext... termContexts) throws IOException { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @Override - public float computeNorm(FieldInvertState state) { + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } + + @Override + public byte computeNorm(FieldInvertState state) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @@ -256,11 +270,6 @@ final class JustCompileSearch { public float sloppyFreq(int distance) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } - - @Override - public float tf(float freq) { - throw new UnsupportedOperationException(UNSUPPORTED_MSG); - } } static final class JustCompileSimilarityProvider implements SimilarityProvider { @@ -348,17 +357,12 @@ final class JustCompileSearch { } @Override - public float getValue() { + public void normalize(float norm, float topLevelBoost) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @Override - public void normalize(float norm) { - throw new UnsupportedOperationException(UNSUPPORTED_MSG); - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } diff --git a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java index e8a6b69a948..71b96a45a7a 100644 --- a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java @@ -62,9 +62,9 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // Disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java new file mode 100644 index 00000000000..2281000eff3 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java @@ -0,0 +1,203 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IndexDocValuesField; +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; + +/** + * Tests the use of IndexDocValues in scoring. + * + * In the example, a docvalues field is used as a per-document boost (separate from the norm). + * @lucene.experimental + */ +public class TestDocValuesScoring extends LuceneTestCase { + private static final float SCORE_EPSILON = 0.001f; /* for comparing floats */ + + public void testSimple() throws Exception { + assumeFalse("PreFlex codec cannot work with IndexDocValues!", + "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec())); + + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + IndexDocValuesField dvField = new IndexDocValuesField("foo_boost"); + doc.add(dvField); + Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field2); + + field.setValue("quick brown fox"); + field2.setValue("quick brown fox"); + dvField.setFloat(2f); // boost x2 + iw.addDocument(doc); + field.setValue("jumps over lazy brown dog"); + field2.setValue("jumps over lazy brown dog"); + dvField.setFloat(4f); // boost x4 + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + // no boosting + IndexSearcher searcher1 = newSearcher(ir); + // boosting + IndexSearcher searcher2 = newSearcher(ir); + searcher2.setSimilarityProvider(new DefaultSimilarityProvider() { + final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost"); + + public Similarity get(String field) { + return "foo".equals(field) ? fooSim : super.get(field); + } + }); + + // in this case, we searched on field "foo". The first document should have 2x the score. + TermQuery tq = new TermQuery(new Term("foo", "quick")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + TopDocs noboost = searcher1.search(tq, 10); + TopDocs boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + //System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc)); + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*2f, SCORE_EPSILON); + + // this query matches only the second document, which should have 4x the score. + tq = new TermQuery(new Term("foo", "jumps")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + noboost = searcher1.search(tq, 10); + boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*4f, SCORE_EPSILON); + + // search on field bar just for kicks, nothing should happen, since we set up + // our sim provider to only use foo_boost for field foo.
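+    // [editor's note, not part of the original patch: searcher2's BoostingSimilarity
+    // multiplies each sub-score by the foo_boost docvalue, so "quick" scored doc 0
+    // at 2x and "jumps" scored doc 1 at 4x the unboosted score above; field "bar"
+    // is served by the default Similarity, so the scores below must be identical.]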
+ tq = new TermQuery(new Term("bar", "quick")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + noboost = searcher1.search(tq, 10); + boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON); + + + searcher1.close(); + searcher2.close(); + ir.close(); + dir.close(); + } + + /** + * Similarity that wraps another similarity and boosts the final score + * according to what's in a docvalues field. + * + * @lucene.experimental + */ + static class BoostingSimilarity extends Similarity { + private final Similarity sim; + private final String boostField; + + public BoostingSimilarity(Similarity sim, String boostField) { + this.sim = sim; + this.boostField = boostField; + } + + @Override + public byte computeNorm(FieldInvertState state) { + return sim.computeNorm(state); + } + + @Override + public float sloppyFreq(int distance) { + return sim.sloppyFreq(distance); + } + + @Override + public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException { + return sim.computeStats(searcher, fieldName, queryBoost, termContexts); + } + + @Override + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + final ExactDocScorer sub = sim.exactDocScorer(stats, fieldName, context); + final Source values = context.reader.docValues(boostField).getSource(); + + return new ExactDocScorer() { + @Override + public float score(int doc, int freq) { + return (float) values.getFloat(doc) * sub.score(doc, freq); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")"); + Explanation simExplanation = sub.explain(doc, freq); + Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:"); + expl.addDetail(boostExplanation); + expl.addDetail(simExplanation); + return expl; + } + }; + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + final SloppyDocScorer sub = sim.sloppyDocScorer(stats, fieldName, context); + final Source values = context.reader.docValues(boostField).getSource(); + + return new SloppyDocScorer() { + @Override + public float score(int doc, float freq) { + return (float) values.getFloat(doc) * sub.score(doc, freq); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")"); + Explanation simExplanation = sub.explain(doc, freq); + Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:"); + expl.addDetail(boostExplanation); + expl.addDetail(simExplanation); + return expl; + } + }; + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java b/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java index 634435844d1..c60a8becae8 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java @@ -49,34 +49,12 @@ public class TestMatchAllDocsQuery extends LuceneTestCase { IndexSearcher is = newSearcher(ir); ScoreDoc[] hits; - //
assert with norms scoring turned off - hits = is.search(new MatchAllDocsQuery(), null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals("one", is.doc(hits[0].doc).get("key")); assertEquals("two", is.doc(hits[1].doc).get("key")); assertEquals("three four", is.doc(hits[2].doc).get("key")); - // assert with norms scoring turned on - - MatchAllDocsQuery normsQuery = new MatchAllDocsQuery("key"); - hits = is.search(normsQuery, null, 1000).scoreDocs; - assertEquals(3, hits.length); - - assertEquals("three four", is.doc(hits[0].doc).get("key")); - assertEquals("two", is.doc(hits[1].doc).get("key")); - assertEquals("one", is.doc(hits[2].doc).get("key")); - - // change norm & retest - is.getIndexReader().setNorm(0, "key", is.getSimilarityProvider().get("key").encodeNormValue(400f)); - normsQuery = new MatchAllDocsQuery("key"); - hits = is.search(normsQuery, null, 1000).scoreDocs; - assertEquals(3, hits.length); - - assertEquals("one", is.doc(hits[0].doc).get("key")); - assertEquals("three four", is.doc(hits[1].doc).get("key")); - assertEquals("two", is.doc(hits[2].doc).get("key")); - // some artificial queries to trigger the use of skipTo(): BooleanQuery bq = new BooleanQuery(); diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index 02b876e0204..c434b1d9c87 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -24,9 +24,9 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -312,21 +312,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase { return new DefaultSimilarity() { @Override - public IDFExplanation idfExplain(Collection terms, + public Explanation idfExplain(TermContext stats[], IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - - @Override - public float getIdf() { - return 10f; - } - - @Override - public String explain() { - return "just a test"; - } - - }; + return new Explanation(10f, "just a test"); } }; } @@ -336,7 +324,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase { query.add(new Term[] { new Term("body", "this"), new Term("body", "that") }); query.add(new Term("body", "is")); Weight weight = query.createWeight(searcher); - assertEquals(10f * 10f, weight.sumOfSquaredWeights(), 0.001f); + assertEquals(10f * 10f, weight.getValueForNormalization(), 0.001f); writer.close(); searcher.close(); diff --git a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java index 906aeb039b9..72245e1207f 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java +++ b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java @@ -50,7 +50,7 @@ public class TestSetNorm extends LuceneTestCase { // reset the boost of each instance of this document IndexReader reader = IndexReader.open(store, false); - Similarity similarity = new DefaultSimilarity(); + DefaultSimilarity similarity = new DefaultSimilarity(); 
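    // [Editor's note on the TestMultiPhraseQuery hunk above, not part of the patch:
    // with idfExplain pinned to 10f and a query boost of 1, the renamed
    // Weight.getValueForNormalization() still returns the squared query weight,
    // (idf * boost)^2 = (10 * 1)^2 = 100 -- hence the expected 10f * 10f.]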
reader.setNorm(0, "field", similarity.encodeNormValue(1.0f)); reader.setNorm(1, "field", similarity.encodeNormValue(2.0f)); reader.setNorm(2, "field", similarity.encodeNormValue(4.0f)); diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java index 3afeb25566f..55c62248a53 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java @@ -18,8 +18,9 @@ package org.apache.lucene.search; */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; + import java.io.IOException; -import java.util.Collection; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; @@ -30,7 +31,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.search.Explanation.IDFExplanation; /** Similarity unit test. * @@ -42,22 +42,13 @@ public class TestSimilarity extends LuceneTestCase { public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; } public Similarity get(String field) { - return new Similarity() { - @Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } + return new DefaultSimilarity() { + @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; } - @Override public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - @Override - public String explain() { - return "Inexplicable"; - } - }; + @Override public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java index 7a9d6410863..1bf30e3b773 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java @@ -105,10 +105,10 @@ public class TestSimilarityProvider extends LuceneTestCase { } } - private class Sim1 extends Similarity { + private class Sim1 extends TFIDFSimilarity { @Override - public float computeNorm(FieldInvertState state) { - return 1f; + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(1f); } @Override @@ -127,10 +127,10 @@ public class TestSimilarityProvider extends LuceneTestCase { } } - private class Sim2 extends Similarity { + private class Sim2 extends TFIDFSimilarity { @Override - public float computeNorm(FieldInvertState state) { - return 10f; + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(10f); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java index 5c115d5cf9f..962eab069c2 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java +++ 
b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.payloads; */ import java.io.IOException; import java.io.Reader; -import java.util.Collection; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -45,7 +44,7 @@ import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.util.TermContext; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -325,8 +324,8 @@ public class TestPayloadNearQuery extends LuceneTestCase { //Make everything else 1 so we see the effect of the payload //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @Override - public float computeNorm(FieldInvertState state) { - return state.getBoost(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(state.getBoost()); } @Override @@ -341,18 +340,8 @@ public class TestPayloadNearQuery extends LuceneTestCase { // idf used for phrase queries @Override - public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - - @Override - public String explain() { - return "Inexplicable"; - } - }; + public Explanation idfExplain(TermContext states[], IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java index 9ed0db35a44..ea35f60cb56 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java @@ -318,8 +318,8 @@ public class TestPayloadTermQuery extends LuceneTestCase { //Make everything else 1 so we see the effect of the payload //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
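[Editor's note] The hunk just below repeats the computeNorm migration seen throughout this patch: the method now returns the already-encoded norm byte rather than a raw float, so implementations must encode explicitly. Condensed into one hedged sketch (the class name is illustrative):

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.DefaultSimilarity;

    class NoLengthNormSimilarity extends DefaultSimilarity {
      @Override
      public byte computeNorm(FieldInvertState state) {
        // disable length normalization: encode only the field boost
        return encodeNormValue(state.getBoost());
      }
    }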
diff --git a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java
index ac0f45d767b..4adceca1bb7 100644
--- a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java
+++ b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java
@@ -135,8 +135,8 @@ final class JustCompileSearchSpans {
   static final class JustCompileSpanScorer extends SpanScorer {
 
     protected JustCompileSpanScorer(Spans spans, Weight weight,
-        Similarity similarity, byte[] norms) throws IOException {
-      super(spans, weight, similarity, norms);
+        Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
+      super(spans, weight, similarity, docScorer);
     }
 
     @Override
diff --git a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java
index a066d1eba26..edc1516be98 100644
--- a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java
+++ b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java
@@ -133,18 +133,13 @@ public class BlockJoinQuery extends Query {
     }
 
     @Override
-    public float getValue() {
-      return childWeight.getValue();
+    public float getValueForNormalization() throws IOException {
+      return childWeight.getValueForNormalization();
     }
 
     @Override
-    public float sumOfSquaredWeights() throws IOException {
-      return childWeight.sumOfSquaredWeights();
-    }
-
-    @Override
-    public void normalize(float norm) {
-      childWeight.normalize(norm);
+    public void normalize(float norm, float topLevelBoost) {
+      childWeight.normalize(norm, topLevelBoost);
     }
 
     @Override
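Note: the Weight changes throughout this patch follow one handshake, sketched here for
orientation (hypothetical snippet, not part of the patch; "provider" and "weight" are assumed
locals for a SimilarityProvider and a top-level Weight). The searcher first sums up
getValueForNormalization() (formerly sumOfSquaredWeights()), turns that into a query norm,
and then pushes the norm plus the accumulated outer boost down the Weight tree:

    float sum = weight.getValueForNormalization(); // was weight.sumOfSquaredWeights()
    float norm = provider.queryNorm(sum);          // queryNorm lives on the provider
    weight.normalize(norm, 1.0f);                  // top level: no enclosing boost yet

Each Weight that carries its own boost folds it into topLevelBoost before delegating, as the
CustomScoreQuery and BoostedQuery diffs below show.
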
diff --git a/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java b/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java
index 7842a90e115..0d24612412d 100755
--- a/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java
@@ -195,21 +195,14 @@ public class CustomScoreQuery extends Query {
       return CustomScoreQuery.this;
     }
 
-    /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */
     @Override
-    public float getValue() {
-      return getBoost();
-    }
-
-    /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
-      float sum = subQueryWeight.sumOfSquaredWeights();
+    public float getValueForNormalization() throws IOException {
+      float sum = subQueryWeight.getValueForNormalization();
       for(int i = 0; i < valSrcWeights.length; i++) {
         if (qStrict) {
-          valSrcWeights[i].sumOfSquaredWeights(); // do not include ValueSource part in the query normalization
+          valSrcWeights[i].getValueForNormalization(); // do not include ValueSource part in the query normalization
         } else {
-          sum += valSrcWeights[i].sumOfSquaredWeights();
+          sum += valSrcWeights[i].getValueForNormalization();
         }
       }
       sum *= getBoost() * getBoost(); // boost each sub-weight
@@ -218,14 +211,14 @@ public class CustomScoreQuery extends Query {
 
     /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */
     @Override
-    public void normalize(float norm) {
-      norm *= getBoost(); // incorporate boost
-      subQueryWeight.normalize(norm);
+    public void normalize(float norm, float topLevelBoost) {
+      topLevelBoost *= getBoost(); // incorporate boost
+      subQueryWeight.normalize(norm, topLevelBoost);
       for(int i = 0; i < valSrcWeights.length; i++) {
         if (qStrict) {
-          valSrcWeights[i].normalize(1); // do not normalize the ValueSource part
+          valSrcWeights[i].normalize(1, 1); // do not normalize the ValueSource part
         } else {
-          valSrcWeights[i].normalize(norm);
+          valSrcWeights[i].normalize(norm, topLevelBoost);
         }
       }
     }
@@ -245,7 +238,7 @@ public class CustomScoreQuery extends Query {
       for(int i = 0; i < valSrcScorers.length; i++) {
         valSrcScorers[i] = valSrcWeights[i].scorer(context, scorerContext.scoreDocsInOrder(true));
       }
-      return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, subQueryScorer, valSrcScorers);
+      return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, getBoost(), subQueryScorer, valSrcScorers);
     }
 
     @Override
@@ -265,11 +258,11 @@ public class CustomScoreQuery extends Query {
         valSrcExpls[i] = valSrcWeights[i].explain(info, doc);
       }
       Explanation customExp = CustomScoreQuery.this.getCustomScoreProvider(info).customExplain(doc,subQueryExpl,valSrcExpls);
-      float sc = getValue() * customExp.getValue();
+      float sc = getBoost() * customExp.getValue();
       Explanation res = new ComplexExplanation(
         true, sc, CustomScoreQuery.this.toString() + ", product of:");
       res.addDetail(customExp);
-      res.addDetail(new Explanation(getValue(), "queryBoost")); // actually using the q boost as q weight (== weight value)
+      res.addDetail(new Explanation(getBoost(), "queryBoost")); // actually using the q boost as q weight (== weight value)
       return res;
     }
 
@@ -294,10 +287,10 @@ public class CustomScoreQuery extends Query {
     private float vScores[]; // reused in score() to avoid allocating this array for each doc
 
     // constructor
-    private CustomScorer(CustomScoreProvider provider, CustomWeight w,
+    private CustomScorer(CustomScoreProvider provider, CustomWeight w, float qWeight,
        Scorer subQueryScorer, Scorer[] valSrcScorers) throws IOException {
       super(w);
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.subQueryScorer = subQueryScorer;
       this.valSrcScorers = valSrcScorers;
       this.vScores = new float[valSrcScorers.length];
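Note: the CustomScorer change above is the other recurring half of the pattern. Since
Weight.getValue() is gone, each Weight computes its final query weight during normalization
and hands it to its Scorer's constructor, instead of the Scorer calling back via w.getValue().
A minimal sketch (hypothetical names, not part of the patch):

    // inside a Weight implementation; queryWeight was computed in normalize()
    @Override
    public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
      return new MyScorer(this, queryWeight); // MyScorer keeps the value in a final field
    }
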
diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java b/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java
index 1fafb077252..3e04f55ae72 100755
--- a/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java
@@ -78,21 +78,16 @@ public class BoostedQuery extends Query {
     }
 
     @Override
-    public float getValue() {
-      return getBoost();
-    }
-
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
-      float sum = qWeight.sumOfSquaredWeights();
+    public float getValueForNormalization() throws IOException {
+      float sum = qWeight.getValueForNormalization();
       sum *= getBoost() * getBoost();
       return sum ;
     }
 
     @Override
-    public void normalize(float norm) {
-      norm *= getBoost();
-      qWeight.normalize(norm);
+    public void normalize(float norm, float topLevelBoost) {
+      topLevelBoost *= getBoost();
+      qWeight.normalize(norm, topLevelBoost);
     }
 
     @Override
@@ -101,7 +96,7 @@ public class BoostedQuery extends Query {
       if(subQueryScorer == null) {
         return null;
       }
-      return new BoostedQuery.CustomScorer(context, this, subQueryScorer, boostVal);
+      return new BoostedQuery.CustomScorer(context, this, getBoost(), subQueryScorer, boostVal);
     }
 
     @Override
@@ -128,11 +123,11 @@ public class BoostedQuery extends Query {
     private final DocValues vals;
     private final AtomicReaderContext readerContext;
 
-    private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w,
+    private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, float qWeight,
        Scorer scorer, ValueSource vs) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.scorer = scorer;
       this.readerContext = readerContext;
       this.vals = vs.getValues(weight.fcontext, readerContext);
diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java b/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java
index ffeba130ae6..65383752569 100644
--- a/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java
@@ -77,25 +77,20 @@ public class FunctionQuery extends Query {
     }
 
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
 
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
 
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new AllScorer(context, this);
+      return new AllScorer(context, this, queryWeight);
     }
 
     @Override
@@ -114,10 +109,10 @@ public class FunctionQuery extends Query {
     final boolean hasDeletions;
     final Bits liveDocs;
 
-    public AllScorer(AtomicReaderContext context, FunctionWeight w) throws IOException {
+    public AllScorer(AtomicReaderContext context, FunctionWeight w, float qWeight) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.reader = context.reader;
       this.maxDoc = reader.maxDoc();
       this.hasDeletions = reader.hasDeletions();
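As a concrete check of the FunctionQuery arithmetic above: with a query boost b,
getValueForNormalization() reports b*b; if the provider's query norm is n and the accumulated
outer boost is t, then normalize(n, t) sets queryNorm = n*t and the final queryWeight = b*n*t,
which AllScorer now receives as its qWeight at construction. With the common trivial values
n = 1 and t = 1 the score multiplier reduces to the boost itself.
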
diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
index 23ccd22cd89..b6a53416f37 100755
--- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java
@@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.queries.function.DocValues;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
@@ -42,9 +43,11 @@ public class IDFValueSource extends DocFreqValueSource {
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     IndexSearcher searcher = (IndexSearcher)context.get("searcher");
     Similarity sim = searcher.getSimilarityProvider().get(field);
-    // todo: we need docFreq that takes a BytesRef
-    int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString()));
-    float idf = sim.idf(docfreq, searcher.maxDoc());
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes));
+    float idf = ((TFIDFSimilarity)sim).idf(docfreq, searcher.maxDoc());
     return new ConstDoubleDocValues(idf, this);
   }
 }
diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
index 5a515ad48c9..f2b5436bb6f 100755
--- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java
@@ -23,6 +23,8 @@ import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.docvalues.FloatDocValues;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
+
 import java.io.IOException;
 import java.util.Map;
 
@@ -49,7 +51,11 @@ public class NormValueSource extends ValueSource {
   @Override
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     IndexSearcher searcher = (IndexSearcher)context.get("searcher");
-    final Similarity similarity = searcher.getSimilarityProvider().get(field);
+    Similarity sim = searcher.getSimilarityProvider().get(field);
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    final TFIDFSimilarity similarity = (TFIDFSimilarity) sim;
     final byte[] norms = readerContext.reader.norms(field);
     if (norms == null) {
       return new ConstDoubleDocValues(0.0, this);
diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
index d868456f8c5..90b605bc25c 100755
--- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
+++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java
@@ -24,6 +24,7 @@ import org.apache.lucene.queries.function.docvalues.FloatDocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TFIDFSimilarity;
 import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
@@ -43,7 +44,11 @@ public class TFValueSource extends TermFreqValueSource {
   public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
     Fields fields = readerContext.reader.fields();
     final Terms terms = fields.terms(field);
-    final Similarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field);
+    final Similarity sim = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field);
+    if (!(sim instanceof TFIDFSimilarity)) {
+      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
+    }
+    final TFIDFSimilarity similarity = (TFIDFSimilarity) sim;
 
     return new FloatDocValues(this) {
       DocsEnum docs ;
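Note: the three value sources above share one guard, sketched standalone here (hypothetical
snippet; field, searcher, docFreq, and maxDoc are assumed to be in scope). Similarity itself no
longer promises TF/IDF hooks, so TF/IDF-specific callers must check and cast first:

    Similarity sim = searcher.getSimilarityProvider().get(field);
    if (!(sim instanceof TFIDFSimilarity)) {
      throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)");
    }
    float idf = ((TFIDFSimilarity)sim).idf(docFreq, maxDoc); // TF/IDF hooks only after the cast
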
diff --git a/solr/src/java/org/apache/solr/schema/LatLonType.java b/solr/src/java/org/apache/solr/schema/LatLonType.java
index dbcd9588c8a..b3956178601 100644
--- a/solr/src/java/org/apache/solr/schema/LatLonType.java
+++ b/solr/src/java/org/apache/solr/schema/LatLonType.java
@@ -354,25 +354,20 @@ class SpatialDistanceQuery extends Query {
     }
 
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
 
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
 
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new SpatialScorer(context, this);
+      return new SpatialScorer(context, this, queryWeight);
     }
 
     @Override
@@ -405,10 +400,10 @@ class SpatialDistanceQuery extends Query {
     int lastDistDoc;
     double lastDist;
 
-    public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w) throws IOException {
+    public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w, float qWeight) throws IOException {
       super(w);
       this.weight = w;
-      this.qWeight = w.getValue();
+      this.qWeight = qWeight;
       this.reader = readerContext.reader;
       this.maxDoc = reader.maxDoc();
       this.liveDocs = reader.getLiveDocs();
diff --git a/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java b/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
index 4f188ea15a1..61da1b1b9d4 100644
--- a/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
+++ b/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
@@ -168,19 +168,15 @@ class JoinQuery extends Query {
       return JoinQuery.this;
     }
 
-    public float getValue() {
-      return getBoost();
-    }
-
     @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
 
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
 
@@ -223,7 +219,7 @@ class JoinQuery extends Query {
 
       DocIdSet readerSet = filter.getDocIdSet(context);
       if (readerSet == null) readerSet=DocIdSet.EMPTY_DOCIDSET;
-      return new JoinScorer(this, readerSet.iterator());
+      return new JoinScorer(this, readerSet.iterator(), getBoost());
     }
 
@@ -514,9 +510,9 @@ class JoinQuery extends Query {
     final float score;
     int doc = -1;
 
-    public JoinScorer(Weight w, DocIdSetIterator iter) throws IOException {
+    public JoinScorer(Weight w, DocIdSetIterator iter, float score) throws IOException {
       super(w);
-      score = w.getValue();
+      this.score = score;
       this.iter = iter==null ? DocIdSet.EMPTY_DOCIDSET.iterator() : iter;
     }
 
diff --git a/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java b/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java
index fd41c32b8f2..2880302e03f 100755
--- a/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java
+++ b/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java
@@ -106,31 +106,26 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend
     }
 
     @Override
-    public float getValue() {
-      return queryWeight;
-    }
-
-    @Override
-    public float sumOfSquaredWeights() throws IOException {
+    public float getValueForNormalization() throws IOException {
       queryWeight = getBoost();
       return queryWeight * queryWeight;
     }
 
     @Override
-    public void normalize(float norm) {
-      this.queryNorm = norm;
+    public void normalize(float norm, float topLevelBoost) {
+      this.queryNorm = norm * topLevelBoost;
       queryWeight *= this.queryNorm;
     }
 
     @Override
     public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
-      return new ConstantScorer(context, this);
+      return new ConstantScorer(context, this, queryWeight);
     }
 
     @Override
     public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
-      ConstantScorer cs = new ConstantScorer(context, this);
+      ConstantScorer cs = new ConstantScorer(context, this, queryWeight);
       boolean exists = cs.docIdSetIterator.advance(doc) == doc;
 
       ComplexExplanation result = new ComplexExplanation();
@@ -157,9 +152,9 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend
     final float theScore;
     int doc = -1;
 
-    public ConstantScorer(AtomicReaderContext context, ConstantWeight w) throws IOException {
+    public ConstantScorer(AtomicReaderContext context, ConstantWeight w, float theScore) throws IOException {
       super(w);
-      theScore = w.getValue();
+      this.theScore = theScore;
       DocIdSet docIdSet = filter instanceof SolrFilter ? ((SolrFilter)filter).getDocIdSet(w.context, context) : filter.getDocIdSet(context);
       if (docIdSet == null) {
         docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator();
assertQ(req("fl","*,score","q", "{!func}termfreq(nofield_t,cow)", "fq","id:6"), "//float[@name='score']='0.0'"); @@ -323,7 +323,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 { state.setBoost(1.0f); state.setLength(4); assertQ(req("fl","*,score","q", "{!func}norm(a_t)", "fq","id:2"), - "//float[@name='score']='" + similarity.computeNorm(state) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte + "//float[@name='score']='" + similarity.decodeNormValue(similarity.computeNorm(state)) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte // test that ord and rord are working on a global index basis, not just // at the segment level (since Lucene 2.9 has switched to per-segment searching)