diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
index 5a0aa76c6a1..1766af6b4c2 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PassageScorer.java
@@ -17,9 +17,6 @@ package org.apache.lucene.sandbox.postingshighlight;
* limitations under the License.
*/
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.TermStatistics;
-
/**
* Used for ranking passages.
*
@@ -44,16 +41,17 @@ public class PassageScorer {
public static final float pivot = 87f;
/**
- * Computes term importance, given its collection-wide statistics.
+ * Computes term importance, given its in-document statistics.
*
- * @param collectionStats statistics for the collection
- * @param termStats statistics for the term
+ * @param contentLength length of the document in characters
+ * @param totalTermFreq number of times the term occurs in the document
* @return term importance
*/
- public float weight(CollectionStatistics collectionStats, TermStatistics termStats) {
- long numDocs = collectionStats.maxDoc();
- long docFreq = termStats.docFreq();
- return (k1 + 1) * (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+ public float weight(int contentLength, int totalTermFreq) {
+ // approximate #docs from content length
+ float numDocs = 1 + contentLength / pivot;
+ // use numDocs rather than numDocs - docFreq (a la DFR), since numDocs is only an approximation
+ return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
}
/**
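For reference, a minimal standalone sketch of the patched weight() computation with concrete numbers. Only pivot = 87f is visible in this hunk; k1 = 1.2f is an assumed value for the class's BM25 constant, which lives outside the diff:

```java
public class WeightSketch {
  static final float k1 = 1.2f;    // assumption: the class's BM25 k1 constant
  static final float pivot = 87f;  // matches the constant shown above

  // Mirrors the new weight(): approximate #docs from the content length,
  // then apply a BM25/IDF-style log using the in-document term frequency.
  static float weight(int contentLength, int totalTermFreq) {
    float numDocs = 1 + contentLength / pivot;
    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
  }

  public static void main(String[] args) {
    // A 1000-char document where the term occurs 3 times:
    // numDocs = 1 + 1000/87 ≈ 12.49, weight ≈ 2.2 * ln(1 + 12.99/3.5) ≈ 3.41
    System.out.println(weight(1000, 3));
  }
}
```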
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
index bae07941922..2d0beee983c 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/postingshighlight/PostingsHighlighter.java
@@ -39,16 +39,12 @@ import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
@@ -135,6 +131,7 @@ public final class PostingsHighlighter {
SortedSet<Term> terms = new TreeSet<Term>();
query.extractTerms(terms);
terms = terms.subSet(floor, ceiling);
+ Term termTexts[] = terms.toArray(new Term[terms.size()]);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
int docids[] = new int[scoreDocs.length];
@@ -156,24 +153,6 @@ public final class PostingsHighlighter {
visitor.reset();
}
- // now pull index stats: TODO: we should probably pull this from the reader instead?
- // this could be a distributed call, which is crazy
- CollectionStatistics collectionStats = searcher.collectionStatistics(field);
- TermContext termContexts[] = new TermContext[terms.size()];
- Term termTexts[] = new Term[terms.size()]; // needed for seekExact
- float weights[] = new float[terms.size()];
- int upto = 0;
- for (Term term : terms) {
- termTexts[upto] = term;
- TermContext context = TermContext.build(readerContext, term, true);
- termContexts[upto] = context;
- TermStatistics termStats = searcher.termStatistics(term, context);
- weights[upto] = scorer.weight(collectionStats, termStats);
- upto++;
- // TODO: should we instead score all the documents term-at-a-time here?
- // the i/o would be better, but more transient ram
- }
-
BreakIterator bi = (BreakIterator)breakIterator.clone();
Map<Integer,String> highlights = new HashMap<Integer,String>();
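The block deleted above was the only consumer of collection-level statistics (and, per the removed TODO, potentially a distributed call); after the patch, weighting happens per (term, document) inside highlightDoc. Both formulas appear verbatim in this diff; a side-by-side sketch, with k1 and pivot standing for the PassageScorer constants:

```java
// Removed: one IDF-style weight per query term, from collection statistics.
float oldWeight(long numDocs, long docFreq) {
  return (k1 + 1) * (float) Math.log(1 + (numDocs - docFreq + 0.5D) / (docFreq + 0.5D));
}

// Replacement: one weight per (term, document), from local statistics only.
float newWeight(int contentLength, int totalTermFreq) {
  float numDocs = 1 + contentLength / pivot; // length-based approximation
  return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
}
```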
@@ -201,7 +180,7 @@ public final class PostingsHighlighter {
termsEnum = t.iterator(null);
postings = new DocsAndPositionsEnum[terms.size()];
}
- Passage passages[] = highlightDoc(termTexts, termContexts, subContext.ord, weights, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length > 0) {
// otherwise a null snippet
highlights.put(doc, formatter.format(passages, content));
@@ -219,9 +198,10 @@ public final class PostingsHighlighter {
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset)
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
- private Passage[] highlightDoc(Term termTexts[], TermContext[] terms, int ord, float[] weights,
- int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+ private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
+ TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
+ float weights[] = new float[terms.length];
// initialize postings
for (int i = 0; i < terms.length; i++) {
DocsAndPositionsEnum de = postings[i];
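The comment above describes intersecting postings offsets with sentence boundaries. As a hedged illustration of the BreakIterator half of that intersection (the sentence instance and sample content are stand-ins; the highlighter's iterator is supplied by the caller):

```java
import java.text.BreakIterator;
import java.util.Locale;

public class SentenceBoundsSketch {
  public static void main(String[] args) {
    String content = "First sentence. The quick brown fox. Last one.";
    int matchStart = content.indexOf("fox"); // offset as reported by postings

    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
    bi.setText(content);
    // snap the match offset to its enclosing "miniature document"
    int start = bi.preceding(matchStart + 1);
    int end = bi.following(matchStart);
    if (start == BreakIterator.DONE) start = 0;
    if (end == BreakIterator.DONE) end = content.length();
    System.out.println(content.substring(start, end)); // "The quick brown fox. "
  }
}
```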
@@ -230,11 +210,9 @@ public final class PostingsHighlighter {
continue;
} else if (de == null) {
postings[i] = EMPTY; // initially
- TermState ts = terms[i].get(ord);
- if (ts == null) {
- continue;
+ if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+ continue; // term not found
}
- termsEnum.seekExact(termTexts[i].bytes(), ts);
DocsAndPositionsEnum de2 = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
if (de2 == null) {
continue;
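With the TermState cache gone, each term is probed directly against the segment's TermsEnum; seekExact returns false for an absent term, so it can be skipped without any up-front bookkeeping. A condensed sketch of the resulting per-term setup (term and doc are stand-ins for the loop variables):

```java
// 4.x-era API, as used in the patch: probe the term, request offsets-capable
// postings, and check whether the target document contains it.
if (termsEnum.seekExact(term.bytes(), true)) {
  DocsAndPositionsEnum de =
      termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
  if (de != null && de.advance(doc) == doc) {
    float w = scorer.weight(contentLength, de.freq()); // lazy, per-document weight
  }
}
```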
@@ -250,6 +228,7 @@ public final class PostingsHighlighter {
}
if (doc == pDoc) {
+ weights[i] = scorer.weight(contentLength, de.freq());
de.nextPosition();
pq.add(new OffsetsEnum(de, i));
}
@@ -315,7 +294,7 @@ public final class PostingsHighlighter {
int tf = 0;
while (true) {
tf++;
- current.addMatch(start, end, termTexts[off.id]);
+ current.addMatch(start, end, terms[off.id]);
if (off.pos == dp.freq()) {
break; // removed from pq
} else {
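For context, a hedged end-to-end sketch of driving this highlighter. The highlight(field, query, searcher, topDocs) entry point matches the sandbox API; everything else here (field setup, the `searcher` variable) is assumed scaffolding outside this diff:

```java
// The field must be indexed with offsets for the postings highlighter to work.
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
doc.add(new Field("body", "this is some content to highlight", ft));
// ... add doc via an IndexWriter, then open an IndexSearcher `searcher` ...

PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "content"));
TopDocs topDocs = searcher.search(query, 10);
String[] snippets = highlighter.highlight("body", query, searcher, topDocs);
// snippets[i] is the formatted passage text for hit i, or null if nothing matched
```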