LUCENE-4846: PostingsHighlighter allows customizing how the values to be highlighted are loaded (default is still stored fields)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458009 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-03-18 21:20:11 +00:00
parent 1f6a3f6a94
commit c5763b80ff
3 changed files with 68 additions and 10 deletions

View File

@ -102,6 +102,10 @@ New Features
* LUCENE-4832: Add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs, to retrieve
  all children in each group. (Aleksey Aleev via Mike McCandless)
* LUCENE-4846: PostingsHighlighter subclasses can override where the
String values come from (it still defaults to pulling from stored
fields). (Robert Muir, Mike McCandless)
API Changes

* LUCENE-4844: removed TaxonomyReader.getParent(), you should use

View File

@ -81,7 +81,7 @@ import org.apache.lucene.util.UnicodeUtil;
* This is thread-safe, and can be used across different readers. * This is thread-safe, and can be used across different readers.
* @lucene.experimental * @lucene.experimental
*/ */
public final class PostingsHighlighter { public class PostingsHighlighter {
// TODO: maybe allow re-analysis for tiny fields? currently we require offsets, // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
// but if the analyzer is really fast and the field is tiny, this might really be // but if the analyzer is really fast and the field is tiny, this might really be
@ -257,15 +257,7 @@ public final class PostingsHighlighter {
Arrays.sort(fields); Arrays.sort(fields);
// pull stored data: // pull stored data:
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength); String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
String contents[][] = new String[fields.length][docids.length];
for (int i = 0; i < docids.length; i++) {
searcher.doc(docids[i], visitor);
for (int j = 0; j < fields.length; j++) {
contents[j][i] = visitor.getValue(j).toString();
}
visitor.reset();
}
Map<String,String[]> highlights = new HashMap<String,String[]>(); Map<String,String[]> highlights = new HashMap<String,String[]>();
for (int i = 0; i < fields.length; i++) { for (int i = 0; i < fields.length; i++) {
@ -285,6 +277,25 @@ public final class PostingsHighlighter {
} }
return highlights; return highlights;
} }
/** Loads the String values for each field/docID pair that will be
 *  highlighted.  The default implementation reads them from stored
 *  fields; a subclass can override this to pull the text from another
 *  source.  Implementations must allocate and completely fill a
 *  String[fields.length][docids.length] array, and the returned
 *  Strings must be identical to the text that was indexed. */
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
  // One row per field, one column per document:
  String[][] values = new String[fields.length][docids.length];
  LimitedStoredFieldVisitor fieldVisitor = new LimitedStoredFieldVisitor(fields, maxLength);
  for (int docUpto = 0; docUpto < docids.length; docUpto++) {
    // Visitor accumulates (and length-limits) the stored values for this doc:
    searcher.doc(docids[docUpto], fieldVisitor);
    for (int fieldUpto = 0; fieldUpto < fields.length; fieldUpto++) {
      values[fieldUpto][docUpto] = fieldVisitor.getValue(fieldUpto).toString();
    }
    fieldVisitor.reset();
  }
  return values;
}
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException { private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>(); Map<Integer,String> highlights = new HashMap<Integer,String>();

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search.postingshighlight;
*/ */
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.Map; import java.util.Map;
@ -465,4 +466,46 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close(); ir.close();
dir.close(); dir.close();
} }
/** Verifies that a subclass can supply the text to highlight itself,
 *  bypassing stored fields entirely (the field below is indexed with
 *  offsets but NOT stored). */
public void testCustomFieldValueSource() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
  config.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  // Index offsets only -- no stored value -- so the default
  // loadFieldValues would have nothing to return:
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  final String text = "This is a test.  Just highlighting from postings. This is also a much sillier test.  Feel free to test test test test test test test.";
  Document document = new Document();
  document.add(new Field("body", text, fieldType));
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);

  // Override the value source to hand back the original text directly:
  PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) {
      @Override
      protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
        assert fields.length == 1;
        assert docids.length == 1;
        String[][] contents = new String[1][1];
        contents[0][0] = text;
        return contents;
      }
    };

  Query query = new TermQuery(new Term("body", "test"));
  TopDocs hits = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(1, hits.totalHits);
  String snippets[] = highlighter.highlight("body", query, searcher, hits, 2);
  assertEquals(1, snippets.length);
  assertEquals("This is a <b>test</b>.  Just highlighting from postings. This is also a much sillier <b>test</b>. Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]);

  reader.close();
  directory.close();
}
} }