mirror of
synced 2025-03-03 14:59:16 +00:00
LUCENE-4652: highlight multiple fields with postings highlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1428147 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
@ -61,6 +61,8 @@ public final class Passage {
* Start offset of this passage.
* @return start index (inclusive) of the passage in the
* original content: always >= 0.
public int getStartOffset() {
return startOffset;
@ -68,6 +70,8 @@ public final class Passage {
* End offset of this passage.
* @return end index (exclusive) of the passage in the
* original content: always >= {@link #getStartOffset()}
public int getEndOffset() {
return endOffset;
@ -91,6 +95,7 @@ public final class Passage {
* Start offsets of the term matches, in increasing order.
* <p>
* Only {@link #getNumMatches} are valid. Note that these
* offsets are absolute (not relative to {@link #getStartOffset()}).
@ -99,19 +104,20 @@ public final class Passage {
* End offsets of the term matches, corresponding with
* {@link #getMatchStarts}. Note that its possible that
* an end offset could exceed beyond the bounds of the passage
* ({@link #getEndOffset()}), if the Analyzer produced a term
* which spans a passage boundary.
* End offsets of the term matches, corresponding with {@link #getMatchStarts}.
* <p>
* Only {@link #getNumMatches} are valid. Note that it's possible that an end offset
* could extend beyond the bounds of the passage ({@link #getEndOffset()}), if the
* Analyzer produced a term which spans a passage boundary.
public int[] getMatchEnds() {
return matchEnds;
* Term of the matches, corresponding with
* {@link #getMatchStarts()}.
* Term of the matches, corresponding with {@link #getMatchStarts()}.
* <p>
* Only {@link #getNumMatches()} are valid.
public Term[] getMatchTerms() {
return matchTerms;
@ -18,7 +18,7 @@ package org.apache.lucene.sandbox.postingshighlight;
* Constructs a formatted passage.
* Creates a formatted snippet from the top passages.
* <p>
* The default implementation marks the query terms as bold, and places
* ellipses between unconnected passages.
@ -26,6 +26,12 @@ package org.apache.lucene.sandbox.postingshighlight;
public class PassageFormatter {
* Formats the top <code>passages</code> from <code>content</code>
* into a human-readable text snippet.
* @param passages top-N passages for the field. Note these are sorted in
* the order that they appear in the document for convenience.
* @param content content for the field.
* @return formatted highlight
public String format(Passage passages[], String content) {
@ -18,7 +18,7 @@ package org.apache.lucene.sandbox.postingshighlight;
* Used for ranking passages.
* Ranks passages found by {@link PostingsHighlighter}.
* <p>
* Each passage is scored as a miniature document within the document.
* The final score is computed as {@link #norm} * ∑ ({@link #weight} * {@link #tf}).
@ -62,19 +62,15 @@ import org.apache.lucene.util.UnicodeUtil;
* Field body = new Field("body", "foobar", offsetsType);
* // retrieve highlights at query time
* PostingsHighlighter highlighter = new PostingsHighlighter("body");
* PostingsHighlighter highlighter = new PostingsHighlighter();
* Query query = new TermQuery(new Term("body", "highlighting"));
* TopDocs topDocs = searcher.search(query, n);
* String highlights[] = highlighter.highlight(query, searcher, topDocs);
* String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
* </pre>
* @lucene.experimental
public final class PostingsHighlighter {
// TODO: support highlighting multiple fields at once? someone is bound
// to try to use this in a slow way (invoking over and over for each field), which
// would be horrible.
// TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
// but if the analyzer is really fast and the field is tiny, this might really be
// unnecessary.
@ -86,25 +82,37 @@ public final class PostingsHighlighter {
* closer to the beginning of the document better summarize its content */
public static final int DEFAULT_MAX_LENGTH = 10000;
private final String field;
private final Term floor;
private final Term ceiling;
private final int maxLength;
private final BreakIterator breakIterator;
private final PassageScorer scorer;
private final PassageFormatter formatter;
public PostingsHighlighter(String field) {
this(field, DEFAULT_MAX_LENGTH);
* Creates a new highlighter with default parameters.
public PostingsHighlighter() {
public PostingsHighlighter(String field, int maxLength) {
this(field, maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
* Creates a new highlighter, specifying maximum content length.
* @param maxLength maximum content size to process.
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
public PostingsHighlighter(int maxLength) {
this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
public PostingsHighlighter(String field, int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
this.field = field;
if (maxLength == Integer.MAX_VALUE) {
* Creates a new highlighter with custom parameters.
* @param maxLength maximum content size to process.
* @param breakIterator used for finding passage boundaries.
* @param scorer used for ranking passages.
* @param formatter used for formatting passages into highlighted snippets.
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
@ -113,26 +121,107 @@ public final class PostingsHighlighter {
this.breakIterator = breakIterator;
this.scorer = scorer;
this.formatter = formatter;
floor = new Term(field, "");
ceiling = new Term(field, UnicodeUtil.BIG_TERM);
* Calls {@link #highlight(Query, IndexSearcher, TopDocs, int) highlight(query, searcher, topDocs, 1)}
* Highlights the top passages from a single field.
* @param field field name to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without offsets
public String[] highlight(Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
return highlight(query, searcher, topDocs, 1);
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
return highlight(field, query, searcher, topDocs, 1);
public String[] highlight(Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
* Highlights the top-N passages from a single field.
* @param field field name to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages used to
* form the highlighted snippets.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without offsets
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, maxPassages);
return res.get(field);
* Highlights the top passages from multiple fields.
* <p>
* Conceptually, this behaves as a more efficient form of:
* <pre class="prettyprint">
* Map m = new HashMap();
* for (String field : fields) {
* m.put(field, highlight(field, query, searcher, topDocs));
* }
* return m;
* </pre>
* @param fields field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if a field in <code>fields</code> was indexed without offsets
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
return highlightFields(fields, query, searcher, topDocs, 1);
* Highlights the top-N passages from multiple fields.
* <p>
* Conceptually, this behaves as a more efficient form of:
* <pre class="prettyprint">
* Map m = new HashMap();
* for (String field : fields) {
* m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
* }
* return m;
* </pre>
* @param fields field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if a field in <code>fields</code> was indexed without offsets
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
final IndexReader reader = searcher.getIndexReader();
final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
query = rewrite(query);
SortedSet<Term> terms = new TreeSet<Term>();
terms = terms.subSet(floor, ceiling);
Term termTexts[] = terms.toArray(new Term[terms.size()]);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
SortedSet<Term> queryTerms = new TreeSet<Term>();
int docids[] = new int[scoreDocs.length];
for (int i = 0; i < docids.length; i++) {
@ -140,21 +229,44 @@ public final class PostingsHighlighter {
IndexReaderContext readerContext = reader.getContext();
List<AtomicReaderContext> leaves = readerContext.leaves();
BreakIterator bi = (BreakIterator)breakIterator.clone();
// sort for sequential io
// pull stored data
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(field, maxLength);
String contents[] = new String[docids.length];
for (int i = 0; i < contents.length; i++) {
// pull stored data:
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
String contents[][] = new String[fields.length][docids.length];
for (int i = 0; i < docids.length; i++) {
reader.document(docids[i], visitor);
contents[i] = visitor.getValue();
for (int j = 0; j < fields.length; j++) {
contents[j][i] = visitor.getValue(j).toString();
BreakIterator bi = (BreakIterator)breakIterator.clone();
Map<String,String[]> highlights = new HashMap<String,String[]>();
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
String[] result = new String[scoreDocs.length];
for (int j = 0; j < scoreDocs.length; j++) {
result[j] = fieldHighlights.get(scoreDocs[j].doc);
highlights.put(field, result);
return highlights;
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
// reuse in the real sense... for docs in same segment we just advance our old enum
@ -178,9 +290,9 @@ public final class PostingsHighlighter {
if (leaf != lastLeaf) {
termsEnum = t.iterator(null);
postings = new DocsAndPositionsEnum[terms.size()];
postings = new DocsAndPositionsEnum[terms.length];
Passage passages[] = highlightDoc(termTexts, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length > 0) {
// otherwise a null snippet
highlights.put(doc, formatter.format(passages, content));
@ -188,17 +300,13 @@ public final class PostingsHighlighter {
lastLeaf = leaf;
String[] result = new String[scoreDocs.length];
for (int i = 0; i < scoreDocs.length; i++) {
result[i] = highlights.get(scoreDocs[i].doc);
return result;
return highlights;
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] highlightDoc(Term terms[], int contentLength, BreakIterator bi, int doc,
private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
float weights[] = new float[terms.length];
@ -381,17 +489,24 @@ public final class PostingsHighlighter {
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
private final String field;
private final String fields[];
private final int maxLength;
private final StringBuilder builder = new StringBuilder();
private final StringBuilder builders[];
private int currentField = -1;
public LimitedStoredFieldVisitor(String field, int maxLength) {
this.field = field;
public LimitedStoredFieldVisitor(String fields[], int maxLength) {
this.fields = fields;
this.maxLength = maxLength;
builders = new StringBuilder[fields.length];
for (int i = 0; i < builders.length; i++) {
builders[i] = new StringBuilder();
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
assert currentField >= 0;
StringBuilder builder = builders[currentField];
if (builder.length() > 0) {
builder.append(' '); // for the offset gap, TODO: make this configurable
@ -404,22 +519,24 @@ public final class PostingsHighlighter {
public Status needsField(FieldInfo fieldInfo) throws IOException {
if (fieldInfo.name.equals(field)) {
if (builder.length() > maxLength) {
return Status.STOP;
return Status.YES;
} else {
currentField = Arrays.binarySearch(fields, fieldInfo.name);
if (currentField < 0) {
return Status.NO;
} else if (builders[currentField].length() > maxLength) {
return fields.length == 1 ? Status.STOP : Status.NO;
return Status.YES;
String getValue() {
return builder.toString();
String getValue(int i) {
return builders[i].toString();
void reset() {
currentField = -1;
for (int i = 0; i < fields.length; i++) {
@ -17,6 +17,8 @@ package org.apache.lucene.sandbox.postingshighlight;
* limitations under the License.
import java.util.Map;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -63,11 +65,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter("body");
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight(query, searcher, topDocs);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("Just a test <b>highlighting</b> from postings. ", snippets[0]);
assertEquals("<b>Highlighting</b> the first term. ", snippets[1]);
@ -99,11 +101,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter("body");
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight(query, searcher, topDocs);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
@ -112,6 +114,47 @@ public class TestPostingsHighlighter extends LuceneTestCase {
public void testMultipleFields() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
Field body = new Field("body", "", offsetsType);
Field title = new Field("title", "", offsetsType);
Document doc = new Document();
body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
title.setStringValue("I am hoping for the best.");
body.setStringValue("Highlighting the first term. Hope it works.");
title.setStringValue("But best may not be good enough.");
IndexReader ir = iw.getReader();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
Map<String,String[]> snippets = highlighter.highlightFields(new String [] { "body", "title" }, query, searcher, topDocs);
assertEquals(2, snippets.size());
assertEquals("Just a test <b>highlighting</b> from postings. ", snippets.get("body")[0]);
assertEquals("<b>Highlighting</b> the first term. ", snippets.get("body")[1]);
assertEquals("I am hoping for the <b>best</b>.", snippets.get("title")[0]);
assertEquals("But <b>best</b> may not be good enough.", snippets.get("title")[1]);
public void testMultipleTerms() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
@ -133,14 +176,14 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter("body");
PostingsHighlighter highlighter = new PostingsHighlighter();
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "just")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "first")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight(query, searcher, topDocs);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.length);
assertEquals("<b>Just</b> a test <b>highlighting</b> from postings. ", snippets[0]);
assertEquals("<b>Highlighting</b> the <b>first</b> term. ", snippets[1]);
@ -170,11 +213,11 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter("body");
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
String snippets[] = highlighter.highlight(query, searcher, topDocs, 2);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
assertEquals(2, snippets.length);
assertEquals("This is a <b>test</b>. Just a <b>test</b> highlighting from postings. ", snippets[0]);
assertEquals("This <b>test</b> is another <b>test</b>. ... <b>Test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[1]);
@ -204,12 +247,12 @@ public class TestPostingsHighlighter extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter("body");
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
try {
highlighter.highlight(query, searcher, topDocs, 2);
highlighter.highlight("body", query, searcher, topDocs, 2);
fail("did not hit expected exception");
} catch (IllegalArgumentException iae) {
// expected
@ -109,14 +109,12 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
for (int n = 1; n < maxTopN; n++) {
FakePassageFormatter f1 = new FakePassageFormatter();
PostingsHighlighter p1 = new PostingsHighlighter("body",
PostingsHighlighter p1 = new PostingsHighlighter(Integer.MAX_VALUE-1,
new PassageScorer(),
FakePassageFormatter f2 = new FakePassageFormatter();
PostingsHighlighter p2 = new PostingsHighlighter("body",
PostingsHighlighter p2 = new PostingsHighlighter(Integer.MAX_VALUE-1,
new PassageScorer(),
@ -124,8 +122,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
bq.add(query, BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
TopDocs td = is.search(bq, 1);
p1.highlight(bq, is, td, n);
p2.highlight(bq, is, td, n+1);
p1.highlight("body", bq, is, td, n);
p2.highlight("body", bq, is, td, n+1);
Reference in New Issue
Block a user