diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
index 5a0aa76c6a1..1766af6b4c2 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
@@ -17,9 +17,6 @@ package org.apache.lucene.sandbox.postingshighlight;
  * limitations under the License.
  */
 
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.TermStatistics;
-
 /**
  * Used for ranking passages.
  *
@@ -44,16 +41,17 @@ public class PassageScorer {
   public static final float pivot = 87f;
 
   /**
-   * Computes term importance, given its collection-wide statistics.
+   * Computes term importance, given its in-document statistics.
    *
-   * @param collectionStats statistics for the collection
-   * @param termStats statistics for the term
+   * @param contentLength length of document in characters
+   * @param totalTermFreq number of times term occurs in document
    * @return term importance
    */
-  public float weight(CollectionStatistics collectionStats, TermStatistics termStats) {
-    long numDocs = collectionStats.maxDoc();
-    long docFreq = termStats.docFreq();
-    return (k1 + 1) * (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+  public float weight(int contentLength, int totalTermFreq) {
+    // approximate #docs from content length
+    float numDocs = 1 + contentLength / pivot;
+    // numDocs, not numDocs - docFreq (a la DFR), since we approximate numDocs
+    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
   }
 
   /**
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
index bae07941922..2d0beee983c 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
@@ -39,16 +39,12 @@ import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
@@ -135,6 +131,7 @@ public final class PostingsHighlighter {
     SortedSet<Term> terms = new TreeSet<Term>();
     query.extractTerms(terms);
     terms = terms.subSet(floor, ceiling);
+    Term termTexts[] = terms.toArray(new Term[terms.size()]);
     // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
 
     int docids[] = new int[scoreDocs.length];
@@ -156,24 +153,6 @@ public final class PostingsHighlighter {
       visitor.reset();
     }
 
-    // now pull index stats: TODO: we should probably pull this from the reader instead?
-    // this could be a distributed call, which is crazy
-    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
-    TermContext termContexts[] = new TermContext[terms.size()];
-    Term termTexts[] = new Term[terms.size()]; // needed for seekExact
-    float weights[] = new float[terms.size()];
-    int upto = 0;
-    for (Term term : terms) {
-      termTexts[upto] = term;
-      TermContext context = TermContext.build(readerContext, term, true);
-      termContexts[upto] = context;
-      TermStatistics termStats = searcher.termStatistics(term, context);
-      weights[upto] = scorer.weight(collectionStats, termStats);
-      upto++;
-      // TODO: should we instead score all the documents term-at-a-time here?
-      // the i/o would be better, but more transient ram
-    }
-
     BreakIterator bi = (BreakIterator)breakIterator.clone();
 
     Map<Integer,String> highlights = new HashMap<Integer,String>();
@@ -201,7 +180,7 @@ public final class PostingsHighlighter {
         termsEnum = t.iterator(null);
         postings = new DocsAndPositionsEnum[terms.size()];
       }
-      Passage passages[] = highlightDoc(termTexts, termContexts, subContext.ord, weights, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
       if (passages.length > 0) {
         // otherwise a null snippet
         highlights.put(doc, formatter.format(passages, content));
@@ -219,9 +198,10 @@ public final class PostingsHighlighter {
   // algorithm: treat sentence snippets as miniature documents
   // we can intersect these with the postings lists via BreakIterator.preceding(offset)
   // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(Term termTexts[], TermContext[] terms, int ord, float[] weights,
-      int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+  private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
+      TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
     PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
+    float weights[] = new float[terms.length];
     // initialize postings
     for (int i = 0; i < terms.length; i++) {
       DocsAndPositionsEnum de = postings[i];
@@ -230,11 +210,9 @@ public final class PostingsHighlighter {
         continue;
       } else if (de == null) {
         postings[i] = EMPTY; // initially
-        TermState ts = terms[i].get(ord);
-        if (ts == null) {
-          continue;
+        if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+          continue; // term not found
         }
-        termsEnum.seekExact(termTexts[i].bytes(), ts);
         DocsAndPositionsEnum de2 = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
         if (de2 == null) {
           continue;
@@ -250,6 +228,7 @@ public final class PostingsHighlighter {
       }
 
       if (doc == pDoc) {
+        weights[i] = scorer.weight(contentLength, de.freq());
         de.nextPosition();
         pq.add(new OffsetsEnum(de, i));
       }
@@ -315,7 +294,7 @@ public final class PostingsHighlighter {
         int tf = 0;
         while (true) {
           tf++;
-          current.addMatch(start, end, terms[off.id]);
+          current.addMatch(start, end, terms[off.id]);
           if (off.pos == dp.freq()) {
             break; // removed from pq
           } else {
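Note on the new weight formula: the patch replaces the BM25-style IDF over collection statistics with a purely in-document approximation, treating every pivot characters of content (87, roughly an average sentence) as one pseudo-document. A minimal standalone sketch for illustration, assuming k1 keeps the usual BM25 default of 1.2f (the constant is defined in PassageScorer but is not shown in this diff); WeightSketch itself is not a Lucene class:

import java.util.Locale;

// Illustrative sketch only, not the shipped PassageScorer class.
public class WeightSketch {
  static final float k1 = 1.2f;    // assumed BM25 default; PassageScorer's definition is not in this diff
  static final float pivot = 87f;  // ~average sentence length in characters, from the diff

  // mirrors the new PassageScorer.weight(int, int)
  static float weight(int contentLength, int totalTermFreq) {
    // approximate #docs from content length: each ~pivot chars is one pseudo-document
    float numDocs = 1 + contentLength / pivot;
    // IDF-like damping: terms frequent within the document get less weight
    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
  }

  public static void main(String[] args) {
    // in a 2000-char document (~24 pseudo-docs), a rare term outweighs a common one
    System.out.println(String.format(Locale.ROOT, "freq=1:  %.2f", weight(2000, 1)));   // ~6.28
    System.out.println(String.format(Locale.ROOT, "freq=20: %.2f", weight(2000, 20)));  // ~1.73
  }
}

The "+ 0.5D" terms keep the ratio finite and positive even when totalTermFreq exceeds the pseudo-document count, which is why the DFR-style "numDocs - docFreq" numerator of the old formula is dropped, as the in-diff comment notes.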
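The comment above highlightDoc summarizes the scoring model: each sentence found by the BreakIterator is treated as a miniature document and scored as norm(sentenceStartOffset) * sum(weight * tf(freq)). A sketch of how those pieces combine, where norm and tf are hypothetical stand-ins (the real definitions live in PassageScorer and are not part of this diff):

// Illustrative sketch of the sentence-scoring model described above.
// norm() and tf() are hypothetical placeholders, not PassageScorer's definitions.
public class SentenceScoreSketch {
  // hypothetical position bias: sentences earlier in the document score higher
  static float norm(int sentenceStartOffset) {
    return 1 + 1 / (float) Math.log(87f + sentenceStartOffset);
  }

  // hypothetical saturating term-frequency curve
  static float tf(int freq) {
    return (float) Math.sqrt(freq);
  }

  // score = norm(sentenceStartOffset) * sum(weight[i] * tf(freq[i]))
  static float score(int sentenceStartOffset, float[] weights, int[] freqs) {
    float sum = 0;
    for (int i = 0; i < weights.length; i++) {
      sum += weights[i] * tf(freqs[i]); // one entry per query term matching this sentence
    }
    return norm(sentenceStartOffset) * sum;
  }

  public static void main(String[] args) {
    float[] weights = {6.28f, 1.73f};  // e.g. per-term weights from the formula above
    int[] freqs = {1, 2};              // per-sentence term frequencies
    System.out.println(score(0, weights, freqs));    // early sentence, higher norm
    System.out.println(score(1500, weights, freqs)); // later sentence, lower norm
  }
}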