mirror of https://github.com/apache/lucene.git
LUCENE-4648: use only within-doc stats for PostingsHighlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1425992 13f79535-47bb-0310-9956-ffa450edef68
parent aa00eb2e70
commit 936a67bb7d
PassageScorer.java

@@ -17,9 +17,6 @@ package org.apache.lucene.sandbox.postingshighlight;
  * limitations under the License.
  */
 
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.TermStatistics;
-
 /**
  * Used for ranking passages.
  * <p>
@@ -44,16 +41,17 @@ public class PassageScorer {
   public static final float pivot = 87f;
 
   /**
-   * Computes term importance, given its collection-wide statistics.
+   * Computes term importance, given its in-document statistics.
    *
-   * @param collectionStats statistics for the collection
-   * @param termStats statistics for the term
+   * @param contentLength length of document in characters
+   * @param totalTermFreq number of times the term occurs in the document
    * @return term importance
    */
-  public float weight(CollectionStatistics collectionStats, TermStatistics termStats) {
-    long numDocs = collectionStats.maxDoc();
-    long docFreq = termStats.docFreq();
-    return (k1 + 1) * (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+  public float weight(int contentLength, int totalTermFreq) {
+    // approximate #docs from content length
+    float numDocs = 1 + contentLength / pivot;
+    // numDocs, not numDocs - docFreq (a la DFR), since we approximate numDocs
+    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
   }
 
   /**
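The rewritten weight needs no collection-wide statistics: the document's character length is converted into a pseudo document count via the pivot, and the within-document term frequency stands in for docFreq. Below is a minimal standalone sketch of that formula; the k1 value and the wrapper class are illustrative assumptions, only pivot and the expression itself come from the hunk above.

    // Standalone sketch of the new within-document weight (not the committed class).
    public class WithinDocWeightSketch {
      static final float k1 = 1.2f;   // assumed BM25-style constant, for illustration only
      static final float pivot = 87f; // same value as PassageScorer.pivot above

      static float weight(int contentLength, int totalTermFreq) {
        // approximate the number of "documents" (sentence-sized chunks) from character length
        float numDocs = 1 + contentLength / pivot;
        // IDF-like shape: terms that are rare within this document get a higher weight
        return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
      }

      public static void main(String[] args) {
        System.out.println(weight(870, 1)); // rare term in a roughly 10-sentence document
        System.out.println(weight(870, 8)); // frequent term in the same document scores lower
      }
    }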
PostingsHighlighter.java

@@ -39,16 +39,12 @@ import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
@@ -135,6 +131,7 @@ public final class PostingsHighlighter {
     SortedSet<Term> terms = new TreeSet<Term>();
     query.extractTerms(terms);
     terms = terms.subSet(floor, ceiling);
+    Term termTexts[] = terms.toArray(new Term[terms.size()]);
     // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
 
     int docids[] = new int[scoreDocs.length];
@@ -156,24 +153,6 @@ public final class PostingsHighlighter {
       visitor.reset();
     }
 
-    // now pull index stats: TODO: we should probably pull this from the reader instead?
-    // this could be a distributed call, which is crazy
-    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
-    TermContext termContexts[] = new TermContext[terms.size()];
-    Term termTexts[] = new Term[terms.size()]; // needed for seekExact
-    float weights[] = new float[terms.size()];
-    int upto = 0;
-    for (Term term : terms) {
-      termTexts[upto] = term;
-      TermContext context = TermContext.build(readerContext, term, true);
-      termContexts[upto] = context;
-      TermStatistics termStats = searcher.termStatistics(term, context);
-      weights[upto] = scorer.weight(collectionStats, termStats);
-      upto++;
-      // TODO: should we instead score all the documents term-at-a-time here?
-      // the i/o would be better, but more transient ram
-    }
-
     BreakIterator bi = (BreakIterator)breakIterator.clone();
 
     Map<Integer,String> highlights = new HashMap<Integer,String>();
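For contrast, the removed block above had to pull index-wide statistics through the IndexSearcher before any document could be highlighted, and on a sharded setup those lookups can be remote calls. A small sketch of exactly those lookups; the wrapper class and method name are illustrative, while the Lucene calls are the same ones the removed lines use.

    import java.io.IOException;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermContext;
    import org.apache.lucene.search.CollectionStatistics;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermStatistics;

    class IndexStatsSketch {
      // Fetches the two statistics the old weight(collectionStats, termStats) consumed:
      // maxDoc for the field's collection and docFreq for a single term.
      static long[] maxDocAndDocFreq(IndexSearcher searcher, String field, Term term) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        TermContext context = TermContext.build(searcher.getTopReaderContext(), term, true);
        TermStatistics termStats = searcher.termStatistics(term, context);
        return new long[] { collectionStats.maxDoc(), termStats.docFreq() };
      }
    }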
@@ -201,7 +180,7 @@ public final class PostingsHighlighter {
         termsEnum = t.iterator(null);
         postings = new DocsAndPositionsEnum[terms.size()];
       }
-      Passage passages[] = highlightDoc(termTexts, termContexts, subContext.ord, weights, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
       if (passages.length > 0) {
         // otherwise a null snippet
         highlights.put(doc, formatter.format(passages, content));
@@ -219,9 +198,10 @@ public final class PostingsHighlighter {
   // algorithm: treat sentence snippets as miniature documents
   // we can intersect these with the postings lists via BreakIterator.preceding(offset)
   // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(Term termTexts[], TermContext[] terms, int ord, float[] weights,
-      int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+  private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
+      TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
     PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
+    float weights[] = new float[terms.length];
     // initialize postings
     for (int i = 0; i < terms.length; i++) {
       DocsAndPositionsEnum de = postings[i];
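The three comment lines above are the core idea: each sentence found by the BreakIterator acts as a miniature document and is scored as norm(sentenceStartOffset) * sum(weight * tf(freq)). A self-contained sketch, using only the plain JDK BreakIterator rather than the highlighter code, of how a match offset from the postings maps to its enclosing sentence via preceding/following:

    import java.text.BreakIterator;
    import java.util.Locale;

    class SentenceSnippetSketch {
      public static void main(String[] args) {
        String content = "Lucene is a search library. It supports postings with offsets. Highlighting uses them.";
        int matchStart = content.indexOf("postings"); // pretend this came from a postings startOffset()

        BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
        bi.setText(content);

        // sentence boundaries surrounding the match offset; this sentence is the
        // "miniature document" that gets scored
        int sentenceStart = bi.preceding(matchStart + 1);
        int sentenceEnd = bi.following(matchStart);
        if (sentenceStart == BreakIterator.DONE) sentenceStart = 0;
        if (sentenceEnd == BreakIterator.DONE) sentenceEnd = content.length();

        System.out.println(content.substring(sentenceStart, sentenceEnd)); // prints the middle sentence
      }
    }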
@@ -230,11 +210,9 @@ public final class PostingsHighlighter {
         continue;
       } else if (de == null) {
         postings[i] = EMPTY; // initially
-        TermState ts = terms[i].get(ord);
-        if (ts == null) {
-          continue;
+        if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+          continue; // term not found
         }
-        termsEnum.seekExact(termTexts[i].bytes(), ts);
         DocsAndPositionsEnum de2 = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
         if (de2 == null) {
           continue;
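The new branch above resolves each query term directly against the per-segment TermsEnum instead of a precomputed TermState. A minimal sketch of that lookup on the Lucene 4.x postings API; the helper class and method are illustrative, and the field must have been indexed with offsets for FLAG_OFFSETS to return usable data.

    import java.io.IOException;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    class OffsetsSketch {
      // Seeks one term and prints its character offsets within a single document.
      static void printOffsets(Terms terms, BytesRef term, int docId) throws IOException {
        TermsEnum termsEnum = terms.iterator(null);
        if (!termsEnum.seekExact(term, true)) {
          return; // term not found in this segment
        }
        DocsAndPositionsEnum dp =
            termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
        if (dp == null || dp.advance(docId) != docId) {
          return; // no positions/offsets indexed, or term absent from this document
        }
        for (int i = 0; i < dp.freq(); i++) {
          dp.nextPosition();
          System.out.println(term.utf8ToString() + ": [" + dp.startOffset() + "," + dp.endOffset() + ")");
        }
      }
    }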
@@ -250,6 +228,7 @@ public final class PostingsHighlighter {
         }
 
         if (doc == pDoc) {
+          weights[i] = scorer.weight(contentLength, de.freq());
           de.nextPosition();
           pq.add(new OffsetsEnum(de, i));
         }
@@ -315,7 +294,7 @@ public final class PostingsHighlighter {
         int tf = 0;
         while (true) {
           tf++;
-          current.addMatch(start, end, termTexts[off.id]);
+          current.addMatch(start, end, terms[off.id]);
           if (off.pos == dp.freq()) {
             break; // removed from pq
           } else {
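Taken together, highlighting a hit now touches only that document's own postings and stored content; no collection or term statistics are gathered first. A hedged usage sketch follows: the no-argument constructor and the highlight(field, query, searcher, topDocs) entry point are assumed for illustration and are not shown in this commit's hunks.

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.sandbox.postingshighlight.PostingsHighlighter;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;

    class HighlightUsageSketch {
      // Returns one snippet (or null) per hit; assumes 'field' was indexed with offsets.
      static String[] firstSnippets(Directory dir, String field, String text) throws Exception {
        DirectoryReader reader = DirectoryReader.open(dir);
        try {
          IndexSearcher searcher = new IndexSearcher(reader);
          Query query = new TermQuery(new Term(field, text));
          TopDocs topDocs = searcher.search(query, 10);
          PostingsHighlighter highlighter = new PostingsHighlighter(); // assumed constructor
          return highlighter.highlight(field, query, searcher, topDocs); // assumed entry point
        } finally {
          reader.close();
        }
      }
    }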