Mirror of https://github.com/apache/lucene.git
LUCENE-4860: per-field control over scoring and formatting
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1459816 13f79535-47bb-0310-9956-ffa450edef68
parent 97ec69bb69
commit 0e830fbae4
@@ -130,6 +130,12 @@ New Features
 * LUCENE-4752: New SortingMergePolicy (in lucene/misc) that sorts documents
   before merging segments. (Adrien Grand, Shai Erera, David Smiley)
 
+* LUCENE-4860: Customize scoring and formatting per-field in
+  PostingsHighlighter by subclassing and overriding the getFormatter
+  and/or getScorer methods. This also changes Passage.getMatchTerms()
+  to return BytesRef[] instead of Term[]. (Robert Muir, Mike
+  McCandless)
+
 API Changes
 
 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use
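Before the per-file diff, a minimal usage sketch of the new extension point. This is not part of the commit: the field name "title" and the tag strings are hypothetical, while the PassageFormatter(preTag, postTag, ellipsis) and PassageScorer(k1, b, pivot) constructors are the same ones exercised elsewhere in this diff.

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public class PerFieldHighlighterSketch {
  // Returns a highlighter that bolds "title" matches and scores them with
  // different parameters, while every other field keeps the defaults.
  static PostingsHighlighter create() {
    return new PostingsHighlighter(10000, BreakIterator.getSentenceInstance(Locale.ROOT)) {
      @Override
      protected PassageFormatter getFormatter(String field) {
        if ("title".equals(field)) {
          return new PassageFormatter("<b>", "</b>", "... ");
        }
        return super.getFormatter(field); // shared default formatter
      }

      @Override
      protected PassageScorer getScorer(String field) {
        if ("title".equals(field)) {
          return new PassageScorer(1.2f, 0.75f, 64); // hypothetical k1/b/pivot
        }
        return super.getScorer(field); // shared default scorer
      }
    };
  }
}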
@@ -17,8 +17,8 @@ package org.apache.lucene.search.postingshighlight;
  * limitations under the License.
  */
 
-import org.apache.lucene.index.Term;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.SorterTemplate;
 
@@ -36,15 +36,15 @@ public final class Passage {
 
   int matchStarts[] = new int[8];
   int matchEnds[] = new int[8];
-  Term matchTerms[] = new Term[8];
+  BytesRef matchTerms[] = new BytesRef[8];
   int numMatches = 0;
 
-  void addMatch(int startOffset, int endOffset, Term term) {
+  void addMatch(int startOffset, int endOffset, BytesRef term) {
     assert startOffset >= this.startOffset && startOffset <= this.endOffset;
     if (numMatches == matchStarts.length) {
       matchStarts = ArrayUtil.grow(matchStarts, numMatches+1);
       matchEnds = ArrayUtil.grow(matchEnds, numMatches+1);
-      Term newMatchTerms[] = new Term[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+      BytesRef newMatchTerms[] = new BytesRef[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
       System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
       matchTerms = newMatchTerms;
     }
@@ -57,7 +57,7 @@ public final class Passage {
   void sort() {
     final int starts[] = matchStarts;
     final int ends[] = matchEnds;
-    final Term terms[] = matchTerms;
+    final BytesRef terms[] = matchTerms;
     new SorterTemplate() {
       @Override
       protected void swap(int i, int j) {
@@ -69,7 +69,7 @@ public final class Passage {
         ends[i] = ends[j];
         ends[j] = temp;
 
-        Term tempTerm = terms[i];
+        BytesRef tempTerm = terms[i];
         terms[i] = terms[j];
         terms[j] = tempTerm;
       }
@@ -155,11 +155,11 @@ public final class Passage {
   }
 
   /**
-   * Term of the matches, corresponding with {@link #getMatchStarts()}.
+   * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
    * <p>
    * Only {@link #getNumMatches()} are valid.
    */
-  public Term[] getMatchTerms() {
+  public BytesRef[] getMatchTerms() {
     return matchTerms;
   }
 }
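A note on consuming the changed Passage API: getMatchTerms() now hands back the raw term bytes with the field stripped, so callers that previously used Term.text() can switch to BytesRef.utf8ToString(). A small sketch (the helper class and method names are ours, not part of this commit, and it assumes the matching getMatchEnds() accessor):

import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.util.BytesRef;

class PassageMatchDump {
  // Prints each match term next to the text it covers; only the first
  // getNumMatches() entries of the parallel arrays are valid.
  static void printMatches(Passage p, String content) {
    for (int i = 0; i < p.getNumMatches(); i++) {
      BytesRef term = p.getMatchTerms()[i]; // raw term bytes, no field
      int start = p.getMatchStarts()[i];
      int end = p.getMatchEnds()[i];
      System.out.println(term.utf8ToString()
          + " -> \"" + content.substring(start, end) + "\"");
    }
  }
}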
@@ -97,8 +97,14 @@ public class PostingsHighlighter {
 
   private final int maxLength;
   private final BreakIterator breakIterator;
-  private final PassageScorer scorer;
-  private final PassageFormatter formatter;
+
+  /** Set the first time {@link #getFormatter} is called,
+   *  and then reused. */
+  private PassageFormatter defaultFormatter;
+
+  /** Set the first time {@link #getScorer} is called,
+   *  and then reused. */
+  private PassageScorer defaultScorer;
 
   /**
    * Creates a new highlighter with default parameters.
@@ -113,7 +119,7 @@ public class PostingsHighlighter {
    * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
    */
   public PostingsHighlighter(int maxLength) {
-    this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
+    this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT));
   }
 
   /**
@@ -122,11 +128,9 @@ public class PostingsHighlighter {
    * @param breakIterator used for finding passage
    *        boundaries; pass null to highlight the entire
    *        content as a single Passage.
-   * @param scorer used for ranking passages.
-   * @param formatter used for formatting passages into highlighted snippets.
    * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
    */
-  public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
+  public PostingsHighlighter(int maxLength, BreakIterator breakIterator) {
     if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
       // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
       // our sentinel in the offsets queue uses this value to terminate.
@@ -135,13 +139,30 @@ public class PostingsHighlighter {
     if (breakIterator == null) {
       breakIterator = new WholeBreakIterator();
     }
-    if (scorer == null || formatter == null) {
-      throw new NullPointerException();
-    }
     this.maxLength = maxLength;
     this.breakIterator = breakIterator;
-    this.scorer = scorer;
-    this.formatter = formatter;
   }
 
+  /** Returns the {@link PassageFormatter} to use for
+   *  formatting passages into highlighted snippets. This
+   *  returns a new {@code PassageFormatter} by default;
+   *  subclasses can override to customize. */
+  protected PassageFormatter getFormatter(String field) {
+    if (defaultFormatter == null) {
+      defaultFormatter = new PassageFormatter();
+    }
+    return defaultFormatter;
+  }
+
+  /** Returns the {@link PassageScorer} to use for
+   *  ranking passages. This
+   *  returns a new {@code PassageScorer} by default;
+   *  subclasses can override to customize. */
+  protected PassageScorer getScorer(String field) {
+    if (defaultScorer == null) {
+      defaultScorer = new PassageScorer();
+    }
+    return defaultScorer;
+  }
+
   /**
@@ -302,7 +323,13 @@ public class PostingsHighlighter {
       Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
       SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
       // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
-      Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
+
+      // Strip off the redundant field:
+      BytesRef terms[] = new BytesRef[fieldTerms.size()];
+      int termUpto = 0;
+      for(Term term : fieldTerms) {
+        terms[termUpto++] = term.bytes();
+      }
       Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
 
       String[] result = new String[docids.length];
@@ -333,7 +360,7 @@ public class PostingsHighlighter {
     return contents;
   }
 
-  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
     Map<Integer,String> highlights = new HashMap<Integer,String>();
 
     // reuse in the real sense... for docs in same segment we just advance our old enum
@@ -341,6 +368,11 @@ public class PostingsHighlighter {
     TermsEnum termsEnum = null;
     int lastLeaf = -1;
 
+    PassageFormatter fieldFormatter = getFormatter(field);
+    if (fieldFormatter == null) {
+      throw new NullPointerException("PassageFormatter cannot be null");
+    }
+
     for (int i = 0; i < docids.length; i++) {
       String content = contents[i];
       if (content.length() == 0) {
@@ -366,7 +398,7 @@ public class PostingsHighlighter {
       if (passages.length > 0) {
         // otherwise a null snippet (eg if field is missing
         // entirely from the doc)
-        highlights.put(doc, formatter.format(passages, content));
+        highlights.put(doc, fieldFormatter.format(passages, content));
       }
       lastLeaf = leaf;
     }
@@ -377,8 +409,12 @@ public class PostingsHighlighter {
   // algorithm: treat sentence snippets as miniature documents
   // we can intersect these with the postings lists via BreakIterator.preceding(offset)
   // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
-  private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
+  private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
       TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+    PassageScorer scorer = getScorer(field);
+    if (scorer == null) {
+      throw new NullPointerException("PassageScorer cannot be null");
+    }
     PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
     float weights[] = new float[terms.length];
     // initialize postings
@@ -389,7 +425,7 @@ public class PostingsHighlighter {
         continue;
       } else if (de == null) {
         postings[i] = EMPTY; // initially
-        if (!termsEnum.seekExact(terms[i], true)) {
+        if (!termsEnum.seekExact(terms[i], true)) {
           continue; // term not found
         }
         de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
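The three comment lines in the highlightDoc hunk above are the whole scoring story, so a small illustration may help. The following is only a sketch of the formula named in that comment, norm(sentenceStartOffset) * sum(weight * tf(freq)); the tf() and norm() bodies here are placeholders of our own, not the actual PassageScorer internals:

// Sketch of the per-passage score described in the comment above.
// weights[i] is the query-term weight, freqs[i] its frequency in the passage.
class PassageScoreSketch {
  static float scorePassage(float[] weights, int[] freqs, int passageStartOffset) {
    float sum = 0;
    for (int i = 0; i < weights.length; i++) {
      sum += weights[i] * tf(freqs[i]);
    }
    return norm(passageStartOffset) * sum;
  }

  // Placeholder tf: sublinear in the raw frequency.
  static float tf(int freq) {
    return (float) Math.sqrt(freq);
  }

  // Placeholder norm: passages earlier in the document score slightly higher.
  static float norm(int startOffset) {
    return 1 + 1 / (float) Math.log(Math.E + startOffset);
  }
}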
@@ -457,7 +457,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
     Query query = new TermQuery(new Term("body", "test"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(1, topDocs.totalHits);
@@ -527,7 +527,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
 
     IndexSearcher searcher = newSearcher(ir);
 
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) {
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null) {
       @Override
       protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
         assert fields.length == 1;
@@ -636,7 +636,7 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     iw.close();
 
     IndexSearcher searcher = newSearcher(ir);
-    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null);
     Query query = new TermQuery(new Term("body", "highlighting"));
     int[] docIDs = new int[] {0};
     String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
@@ -112,16 +112,26 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
 
   private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
     for (int n = 1; n < maxTopN; n++) {
-      FakePassageFormatter f1 = new FakePassageFormatter();
+      final FakePassageFormatter f1 = new FakePassageFormatter();
       PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
-                                                       BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                       new PassageScorer(),
-                                                       f1);
-      FakePassageFormatter f2 = new FakePassageFormatter();
+                                                       BreakIterator.getSentenceInstance(Locale.ROOT)) {
+        @Override
+        protected PassageFormatter getFormatter(String field) {
+          assertEquals("body", field);
+          return f1;
+        }
+      };
+
+      final FakePassageFormatter f2 = new FakePassageFormatter();
       PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
-                                                       BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                       new PassageScorer(),
-                                                       f2);
+                                                       BreakIterator.getSentenceInstance(Locale.ROOT)) {
+        @Override
+        protected PassageFormatter getFormatter(String field) {
+          assertEquals("body", field);
+          return f2;
+        }
+      };
+
       BooleanQuery bq = new BooleanQuery(false);
       bq.add(query, BooleanClause.Occur.MUST);
       bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
@@ -170,8 +180,7 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
       // we use a very simple analyzer. so we can assert the matches are correct
       int lastMatchStart = -1;
       for (int i = 0; i < p.getNumMatches(); i++) {
-        Term term = p.getMatchTerms()[i];
-        assertEquals("body", term.field());
+        BytesRef term = p.getMatchTerms()[i];
         int matchStart = p.getMatchStarts()[i];
         assertTrue(matchStart >= 0);
         // must at least start within the passage
@@ -184,9 +193,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
         // single character terms
         assertEquals(matchStart+1, matchEnd);
         // and the offsets must be correct...
-        BytesRef bytes = term.bytes();
-        assertEquals(1, bytes.length);
-        assertEquals((char)bytes.bytes[bytes.offset], Character.toLowerCase(content.charAt(matchStart)));
+        assertEquals(1, term.length);
+        assertEquals((char)term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart)));
       }
       // record just the start/end offset for simplicity
       seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
@@ -262,9 +270,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
 
     IndexSearcher searcher = newSearcher(ir);
     PostingsHighlighter highlighter = new PostingsHighlighter(10000,
-                                                              BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                              new PassageScorer(1.2f, 0, 87),
-                                                              new PassageFormatter());
+                                                              BreakIterator.getSentenceInstance(Locale.ROOT)) {
+      @Override
+      protected PassageScorer getScorer(String field) {
+        return new PassageScorer(1.2f, 0, 87);
+      }
+    };
     Query query = new TermQuery(new Term("body", "test"));
     TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
     assertEquals(1, topDocs.totalHits);
@@ -299,9 +310,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
 
     IndexSearcher searcher = newSearcher(ir);
     PostingsHighlighter highlighter = new PostingsHighlighter(10000,
-                                                              BreakIterator.getSentenceInstance(Locale.ROOT),
-                                                              new PassageScorer(0, 0.75f, 87),
-                                                              new PassageFormatter());
+                                                              BreakIterator.getSentenceInstance(Locale.ROOT)) {
+      @Override
+      protected PassageScorer getScorer(String field) {
+        return new PassageScorer(0, 0.75f, 87);
+      }
+    };
     BooleanQuery query = new BooleanQuery();
     query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
     query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
@@ -97,7 +97,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
       if (pivot == null) {
         pivot = "87";
       }
-      PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
+      final PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
 
       // formatter parameters: preTag/postTag/ellipsis
       String preTag = attributes.get("preTag");
@@ -112,7 +112,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
       if (ellipsis == null) {
         ellipsis = "... ";
       }
-      PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
+      final PassageFormatter formatter = new PassageFormatter(preTag, postTag, ellipsis);
 
       String summarizeEmpty = attributes.get("summarizeEmpty");
       final boolean summarizeEmptyBoolean;
@@ -127,7 +127,7 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
       if (attributes.containsKey("maxLength")) {
         maxLength = Integer.parseInt(attributes.get("maxLength"));
       }
-      highlighter = new PostingsHighlighter(maxLength, breakIterator, scorer, formatter) {
+      highlighter = new PostingsHighlighter(maxLength, breakIterator) {
         @Override
         protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
           if (summarizeEmptyBoolean) {
@@ -136,6 +136,16 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
             return new Passage[0];
           }
         }
+
+        @Override
+        protected PassageFormatter getFormatter(String fieldName) {
+          return formatter;
+        }
+
+        @Override
+        protected PassageScorer getScorer(String fieldName) {
+          return scorer;
+        }
       };
     }
 