LUCENE-4906: PostingsHighlighter: add expert API to render highlights to Object

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522619 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-09-12 15:34:55 +00:00
parent 980411d93d
commit c3ccfbe8d1
4 changed files with 108 additions and 10 deletions

View File

@ -47,6 +47,11 @@ Optimizations
======================= Lucene 4.6.0 =======================
New Features
* LUCENE-4906: PostingsHighlighter can now render to custom Object,
for advanced use cases where String is too restrictive (Luca
Cavanna, Robert Muir, Mike McCandless)
======================= Lucene 4.5.0 =======================

View File

@ -31,8 +31,11 @@ public abstract class PassageFormatter {
* @param passages top-N passages for the field. Note these are sorted in
* the order that they appear in the document for convenience.
* @param content content for the field.
* @return formatted highlight
* @return formatted highlight. Note that for the
* non-expert APIs in {@link PostingsHighlighter} that
* return String, the toString method on the Object
* returned by this method is used to compute the string.
*/
public abstract String format(Passage passages[], String content);
public abstract Object format(Passage passages[], String content);
}

View File

@ -267,7 +267,7 @@ public class PostingsHighlighter {
return highlightFields(fields, query, searcher, docids, maxPassages);
}
/**
* Highlights the top-N passages from multiple fields,
* for the provided int[] docids.
@ -280,7 +280,7 @@ public class PostingsHighlighter {
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
* corresponding to the documents in <code>docidsIn</code>.
* If no highlights were found for a document, the
* first {@code maxPassages} from the field will
* be returned.
@ -289,6 +289,45 @@ public class PostingsHighlighter {
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
Map<String,String[]> snippets = new HashMap<String,String[]>();
for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
Object[] snippetObjects = ent.getValue();
String[] snippetStrings = new String[snippetObjects.length];
snippets.put(ent.getKey(), snippetStrings);
for(int i=0;i<snippetObjects.length;i++) {
Object snippet = snippetObjects[i];
if (snippet != null) {
snippetStrings[i] = snippet.toString();
}
}
}
return snippets;
}
/**
* Expert: highlights the top-N passages from multiple fields,
* for the provided int[] docids, to custom Object as
* returned by the {@link PassageFormatter}. Use
* this API to render to something other than String.
*
* @param fieldsIn field names to highlight.
* Must have a stored string value and also be indexed with offsets.
* @param query query to highlight.
* @param searcher searcher that was previously used to execute the query.
* @param docidsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>docidsIn</code>.
* If no highlights were found for a document, the
* first {@code maxPassages} from the field will
* be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
if (fieldsIn.length < 1) {
throw new IllegalArgumentException("fieldsIn must not be empty");
}
@ -335,7 +374,7 @@ public class PostingsHighlighter {
// pull stored data:
String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
Map<String,String[]> highlights = new HashMap<String,String[]>();
Map<String,Object[]> highlights = new HashMap<String,Object[]>();
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
int numPassages = maxPassages[i];
@ -350,9 +389,9 @@ public class PostingsHighlighter {
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
Map<Integer,String> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
String[] result = new String[docids.length];
Object[] result = new Object[docids.length];
for (int j = 0; j < docidsIn.length; j++) {
result[j] = fieldHighlights.get(docidsIn[j]);
}
@ -394,8 +433,8 @@ public class PostingsHighlighter {
return ' ';
}
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,Object> highlights = new HashMap<Integer,Object>();
// reuse in the real sense... for docs in same segment we just advance our old enum
DocsAndPositionsEnum postings[] = null;

View File

@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
@ -47,8 +48,8 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
public class TestPostingsHighlighter extends LuceneTestCase {
@ -1068,4 +1069,54 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close();
dir.close();
}
// LUCENE-4906
public void testObjectFormatter() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
protected PassageFormatter getFormatter(String field) {
return new PassageFormatter() {
PassageFormatter defaultFormatter = new DefaultPassageFormatter();
@Override
public String[] format(Passage passages[], String content) {
// Just turns the String snippet into a length 2
// array of String
return new String[] {"blah blah", defaultFormatter.format(passages, content).toString()};
}
};
}
};
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
int[] docIDs = new int[1];
docIDs[0] = topDocs.scoreDocs[0].doc;
Map<String,Object[]> snippets = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docIDs, new int[] {1});
Object[] bodySnippets = snippets.get("body");
assertEquals(1, bodySnippets.length);
assertTrue(Arrays.equals(new String[] {"blah blah", "Just a test <b>highlighting</b> from postings. "}, (String[]) bodySnippets[0]));
ir.close();
dir.close();
}
}