LUCENE-4846: PostingsHighlighter: allow customizing how the values to be highlighted are loaded (default is still stored fields)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458009 13f79535-47bb-0310-9956-ffa450edef68
2013-03-18 21:20:11 +00:00 · 2013-03-18 21:20:11 +00:00 · c5763b80ff
parent 1f6a3f6a94
commit c5763b80ff
3 changed files with 68 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -102,6 +102,10 @@ New Features
 * LUCENE-4832: Add ToParentBlockJoinCollector.getTopGroupsWithAllChildDocs, to retrieve
  all children in each group.  (Aleksey Aleev via Mike McCandless)

+* LUCENE-4846: PostingsHighlighter subclasses can override where the
+  String values come from (it still defaults to pulling from stored
+  fields).  (Robert Muir, Mike McCandless)
+
 API Changes

 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@ -81,7 +81,7 @@ import org.apache.lucene.util.UnicodeUtil;
 * This is thread-safe, and can be used across different readers.
 * @lucene.experimental
 */
-public final class PostingsHighlighter {
+public class PostingsHighlighter {
  
  // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
  // but if the analyzer is really fast and the field is tiny, this might really be
@ -257,15 +257,7 @@ public final class PostingsHighlighter {
    Arrays.sort(fields);
    
    // pull stored data:
-    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
-    String contents[][] = new String[fields.length][docids.length];
-    for (int i = 0; i < docids.length; i++) {
-      searcher.doc(docids[i], visitor);
-      for (int j = 0; j < fields.length; j++) {
-        contents[j][i] = visitor.getValue(j).toString();
-      }
-      visitor.reset();
-    }
+    String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
    
    Map<String,String[]> highlights = new HashMap<String,String[]>();
    for (int i = 0; i < fields.length; i++) {
@ -285,6 +277,25 @@ public final class PostingsHighlighter {
    }
    return highlights;
  }
+
+  /** Loads the String value of every (field, docID) pair to be highlighted.
+   *  By default each document in {@code docids} is read from the stored fields of
+   *  {@code searcher} through a LimitedStoredFieldVisitor built from {@code fields} and
+   *  {@code maxLength}, but a subclass can change the source.  This method should allocate
+   *  the String[fields.length][docids.length] (indexed [field][document]) and fill all
+   *  values.  The returned Strings must be identical to what was indexed. */
+  protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+    String contents[][] = new String[fields.length][docids.length];
+    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
+    for (int i = 0; i < docids.length; i++) {
+      searcher.doc(docids[i], visitor); // run the visitor over the stored fields of document i
+      for (int j = 0; j < fields.length; j++) {
+        contents[j][i] = visitor.getValue(j).toString(); // first index is the field, second the document
+      }
+      visitor.reset(); // presumably clears the captured values so the visitor can be reused for the next document
+    }
+    return contents;
+  }
    
  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {  
    Map<Integer,String> highlights = new HashMap<Integer,String>();
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@ -18,6 +18,7 @@ package org.apache.lucene.search.postingshighlight;
 */

 import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.Map;

@ -465,4 +466,46 @@ public class TestPostingsHighlighter extends LuceneTestCase {
    ir.close();
    dir.close();
  }
+
+  public void testCustomFieldValueSource() throws Exception { // checks highlighting when the text comes from an overridden loadFieldValues
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    Document doc = new Document();
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED); // starts from the not-stored text field type
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); // index offsets along with the postings
+    final String text = "This is a test.  Just highlighting from postings. This is also a much sillier test.  Feel free to test test test test test test test.";
+    Field body = new Field("body", text, offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()) { // anonymous subclass replaces the value source
+        @Override
+        protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+          assert fields.length == 1; // this test highlights a single field ...
+          assert docids.length == 1; // ... of a single document
+          String[][] contents = new String[1][1];
+          contents[0][0] = text; // hand back the same text that was indexed into the body field
+          return contents;
+        }
+      };
+
+    Query query = new TermQuery(new Term("body", "test"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
+    assertEquals(1, snippets.length);
+    assertEquals("This is a <b>test</b>.  Just highlighting from postings. This is also a much sillier <b>test</b>.  Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
 }