LUCENE-4860: per-field control over scoring and formatting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1459816 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-03-22 14:50:11 +00:00
parent 97ec69bb69
commit 0e830fbae4
6 changed files with 115 additions and 49 deletions


@@ -130,6 +130,12 @@ New Features
* LUCENE-4752: New SortingMergePolicy (in lucene/misc) that sorts documents
before merging segments. (Adrien Grand, Shai Erera, David Smiley)
* LUCENE-4860: Customize scoring and formatting per-field in
PostingsHighlighter by subclassing and overriding the getFormatter
and/or getScorer methods. This also changes Passage.getMatchTerms()
to return BytesRef[] instead of Term[]. (Robert Muir, Mike
McCandless)
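
To illustrate the entry above: subclass PostingsHighlighter and override getFormatter and/or getScorer to pick a formatter or scorer per field. A minimal sketch; the "title" field name, the tag strings, and the scorer parameters are illustrative, not part of this commit:

import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class PerFieldHighlighterSketch {
  // Hypothetical setup: "title" gets <b> tags and its own scorer settings,
  // every other field falls back to the cached defaults in the base class.
  static PostingsHighlighter newHighlighter() {
    return new PostingsHighlighter() {
      @Override
      protected PassageFormatter getFormatter(String field) {
        if ("title".equals(field)) {
          return new PassageFormatter("<b>", "</b>", "... ");
        }
        return super.getFormatter(field);
      }

      @Override
      protected PassageScorer getScorer(String field) {
        if ("title".equals(field)) {
          // k1, b, pivot: illustrative values only
          return new PassageScorer(1.2f, 0.25f, 64);
        }
        return super.getScorer(field);
      }
    };
  }
}

Because both hooks receive the field name, one highlighter instance can format and score each field differently within a single highlightFields call.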
API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use


@@ -17,8 +17,8 @@ package org.apache.lucene.search.postingshighlight;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
@@ -36,15 +36,15 @@ public final class Passage {
int matchStarts[] = new int[8];
int matchEnds[] = new int[8];
Term matchTerms[] = new Term[8];
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
void addMatch(int startOffset, int endOffset, Term term) {
void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
matchStarts = ArrayUtil.grow(matchStarts, numMatches+1);
matchEnds = ArrayUtil.grow(matchEnds, numMatches+1);
Term newMatchTerms[] = new Term[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
BytesRef newMatchTerms[] = new BytesRef[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
matchTerms = newMatchTerms;
}
@@ -57,7 +57,7 @@ public final class Passage {
void sort() {
final int starts[] = matchStarts;
final int ends[] = matchEnds;
final Term terms[] = matchTerms;
final BytesRef terms[] = matchTerms;
new SorterTemplate() {
@Override
protected void swap(int i, int j) {
@@ -69,7 +69,7 @@ public final class Passage {
ends[i] = ends[j];
ends[j] = temp;
Term tempTerm = terms[i];
BytesRef tempTerm = terms[i];
terms[i] = terms[j];
terms[j] = tempTerm;
}
@@ -155,11 +155,11 @@ public final class Passage {
}
/**
* Term of the matches, corresponding with {@link #getMatchStarts()}.
* BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
* <p>
* Only {@link #getNumMatches()} are valid.
*/
public Term[] getMatchTerms() {
public BytesRef[] getMatchTerms() {
return matchTerms;
}
}
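
Since Passage.getMatchTerms() now returns BytesRef[] rather than Term[], consumers such as a custom PassageFormatter decode the term bytes themselves; the field is implied by the field being highlighted instead of being carried on each match. A small sketch (the class and method names here are invented for illustration):

import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.util.BytesRef;

public class PassageMatchDump {
  // Hypothetical helper: print every match in a passage.
  public static void dumpMatches(Passage passage) {
    BytesRef[] terms = passage.getMatchTerms();
    int[] starts = passage.getMatchStarts();
    int[] ends = passage.getMatchEnds();
    // Only the first getNumMatches() entries of the parallel arrays are valid.
    for (int i = 0; i < passage.getNumMatches(); i++) {
      // Term text is raw UTF-8 bytes now; decode it for display.
      System.out.println(terms[i].utf8ToString()
          + " @ [" + starts[i] + ", " + ends[i] + ")");
    }
  }
}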


@@ -97,8 +97,14 @@ public class PostingsHighlighter {
private final int maxLength;
private final BreakIterator breakIterator;
private final PassageScorer scorer;
private final PassageFormatter formatter;
/** Set the first time {@link #getFormatter} is called,
* and then reused. */
private PassageFormatter defaultFormatter;
/** Set the first time {@link #getScorer} is called,
* and then reused. */
private PassageScorer defaultScorer;
/**
* Creates a new highlighter with default parameters.
@@ -113,7 +119,7 @@ public class PostingsHighlighter {
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength) {
this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT));
}
/**
@@ -122,11 +128,9 @@ public class PostingsHighlighter {
* @param breakIterator used for finding passage
* boundaries; pass null to highlight the entire
* content as a single Passage.
* @param scorer used for ranking passages.
* @param formatter used for formatting passages into highlighted snippets.
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
public PostingsHighlighter(int maxLength, BreakIterator breakIterator) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
@@ -135,13 +139,30 @@ public class PostingsHighlighter {
if (breakIterator == null) {
breakIterator = new WholeBreakIterator();
}
if (scorer == null || formatter == null) {
throw new NullPointerException();
}
this.maxLength = maxLength;
this.breakIterator = breakIterator;
this.scorer = scorer;
this.formatter = formatter;
}
/** Returns the {@link PassageFormatter} to use for
* formatting passages into highlighted snippets. This
* returns a new {@code PassageFormatter} by default;
* subclasses can override to customize. */
protected PassageFormatter getFormatter(String field) {
if (defaultFormatter == null) {
defaultFormatter = new PassageFormatter();
}
return defaultFormatter;
}
/** Returns the {@link PassageScorer} to use for
* ranking passages. This
* returns a new {@code PassageScorer} by default;
* subclasses can override to customize. */
protected PassageScorer getScorer(String field) {
if (defaultScorer == null) {
defaultScorer = new PassageScorer();
}
return defaultScorer;
}
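
The base class lazily creates one default formatter and one default scorer and reuses them across calls. A subclass that returns per-field instances can mirror that pattern so it does not allocate a new formatter on every call; a sketch, assuming the formatters are safe to reuse across calls:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class CachingPerFieldHighlighter extends PostingsHighlighter {
  // One formatter per field, created on first use and then reused,
  // mirroring the defaultFormatter handling in the base class.
  private final Map<String,PassageFormatter> formatters = new HashMap<String,PassageFormatter>();

  @Override
  protected PassageFormatter getFormatter(String field) {
    PassageFormatter formatter = formatters.get(field);
    if (formatter == null) {
      // Hypothetical policy: tag snippets with the field name.
      formatter = new PassageFormatter("<em class=\"" + field + "\">", "</em>", "... ");
      formatters.put(field, formatter);
    }
    return formatter;
  }
}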
/**
@@ -302,7 +323,13 @@ public class PostingsHighlighter {
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
// Strip off the redundant field:
BytesRef terms[] = new BytesRef[fieldTerms.size()];
int termUpto = 0;
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
String[] result = new String[docids.length];
@@ -333,7 +360,7 @@ public class PostingsHighlighter {
return contents;
}
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
// reuse in the real sense... for docs in same segment we just advance our old enum
@@ -341,6 +368,11 @@
TermsEnum termsEnum = null;
int lastLeaf = -1;
PassageFormatter fieldFormatter = getFormatter(field);
if (fieldFormatter == null) {
throw new NullPointerException("PassageFormatter cannot be null");
}
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
if (content.length() == 0) {
@@ -366,7 +398,7 @@
if (passages.length > 0) {
// otherwise a null snippet (eg if field is missing
// entirely from the doc)
highlights.put(doc, formatter.format(passages, content));
highlights.put(doc, fieldFormatter.format(passages, content));
}
lastLeaf = leaf;
}
@@ -377,8 +409,12 @@
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset)
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
PassageScorer scorer = getScorer(field);
if (scorer == null) {
throw new NullPointerException("PassageScorer cannot be null");
}
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
float weights[] = new float[terms.length];
// initialize postings
@@ -389,7 +425,7 @@
continue;
} else if (de == null) {
postings[i] = EMPTY; // initially
if (!termsEnum.seekExact(terms[i].bytes(), true)) {
if (!termsEnum.seekExact(terms[i], true)) {
continue; // term not found
}
de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
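
To make the scoring comment above concrete (score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))), a simplified standalone illustration of that formula; the decay and tf functions below are placeholders, not the actual PassageScorer implementation:

public class PassageScoreSketch {
  // score(passage) = norm(passageStart) * sum_i( weight_i * tf(freq_i) )
  static float scorePassage(int passageStart, float[] weights, int[] freqs) {
    float norm = 1.0f / (1.0f + passageStart);            // placeholder start-offset decay
    float sum = 0.0f;
    for (int i = 0; i < weights.length; i++) {
      sum += weights[i] * (float) Math.sqrt(freqs[i]);    // placeholder tf saturation
    }
    return norm * sum;
  }
}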


@@ -457,7 +457,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@@ -527,7 +527,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) {
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null) {
@Override
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
assert fields.length == 1;
@@ -636,7 +636,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");


@@ -112,16 +112,26 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
for (int n = 1; n < maxTopN; n++) {
FakePassageFormatter f1 = new FakePassageFormatter();
final FakePassageFormatter f1 = new FakePassageFormatter();
PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(),
f1);
FakePassageFormatter f2 = new FakePassageFormatter();
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f1;
}
};
final FakePassageFormatter f2 = new FakePassageFormatter();
PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(),
f2);
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageFormatter getFormatter(String field) {
assertEquals("body", field);
return f2;
}
};
BooleanQuery bq = new BooleanQuery(false);
bq.add(query, BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
@@ -170,8 +180,7 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
// we use a very simple analyzer. so we can assert the matches are correct
int lastMatchStart = -1;
for (int i = 0; i < p.getNumMatches(); i++) {
Term term = p.getMatchTerms()[i];
assertEquals("body", term.field());
BytesRef term = p.getMatchTerms()[i];
int matchStart = p.getMatchStarts()[i];
assertTrue(matchStart >= 0);
// must at least start within the passage
@@ -184,9 +193,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
// single character terms
assertEquals(matchStart+1, matchEnd);
// and the offsets must be correct...
BytesRef bytes = term.bytes();
assertEquals(1, bytes.length);
assertEquals((char)bytes.bytes[bytes.offset], Character.toLowerCase(content.charAt(matchStart)));
assertEquals(1, term.length);
assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
}
// record just the start/end offset for simplicity
seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
@@ -262,9 +270,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(1.2f, 0, 87),
new PassageFormatter());
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(1.2f, 0, 87);
}
};
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@@ -299,9 +310,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
BreakIterator.getSentenceInstance(Locale.ROOT),
new PassageScorer(0, 0.75f, 87),
new PassageFormatter());
BreakIterator.getSentenceInstance(Locale.ROOT)) {
@Override
protected PassageScorer getScorer(String field) {
return new PassageScorer(0, 0.75f, 87);
}
};
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);


@@ -97,7 +97,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (pivot == null) {
pivot = "87";
}
PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
final PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
// formatter parameters: preTag/postTag/ellipsis
String preTag = attributes.get("preTag");
@@ -112,7 +112,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (ellipsis == null) {
ellipsis = "... ";
}
PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
final PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
String summarizeEmpty = attributes.get("summarizeEmpty");
final boolean summarizeEmptyBoolean;
@@ -127,7 +127,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
if (attributes.containsKey("maxLength")) {
maxLength = Integer.parseInt(attributes.get("maxLength"));
}
highlighter = new PostingsHighlighter(maxLength, breakIterator, scorer, formatter) {
highlighter = new PostingsHighlighter(maxLength, breakIterator) {
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
if (summarizeEmptyBoolean) {
@@ -136,6 +136,16 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
return new Passage[0];
}
}
@Override
protected PassageFormatter getFormatter(String fieldName) {
return formatter;
}
@Override
protected PassageScorer getScorer(String fieldName) {
return scorer;
}
};
}