LUCENE-4906: PostingsHighlighter: add expert API to render highlights to Object

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522619 13f79535-47bb-0310-9956-ffa450edef68
2013-09-12 15:34:55 +00:00 · 2013-09-12 15:34:55 +00:00 · c3ccfbe8d1
parent 980411d93d
commit c3ccfbe8d1
4 changed files with 108 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -47,6 +47,11 @@ Optimizations

 ======================= Lucene 4.6.0 =======================

+New Features
+
+* LUCENE-4906: PostingsHighlighter can now render to custom Object,
+  for advanced use cases where String is too restrictive (Luca
+  Cavanna, Robert Muir, Mike McCandless)

 ======================= Lucene 4.5.0 =======================

--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
@ -31,8 +31,11 @@ public abstract class PassageFormatter {
   * @param passages top-N passages for the field. Note these are sorted in
   *        the order that they appear in the document for convenience.
   * @param content content for the field.
-   * @return formatted highlight
+   * @return formatted highlight.  Note that for the
+   * non-expert APIs in {@link PostingsHighlighter} that
+   * return String, the toString method on the Object
+   * returned by this method is used to compute the string.
   */
-  public abstract String format(Passage passages[], String content);
+  public abstract Object format(Passage passages[], String content);

 }
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@ -267,7 +267,7 @@ public class PostingsHighlighter {

    return highlightFields(fields, query, searcher, docids, maxPassages);
  }
-    
+
  /**
   * Highlights the top-N passages from multiple fields,
   * for the provided int[] docids.
@ -280,7 +280,7 @@ public class PostingsHighlighter {
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to 
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets 
-   *         corresponding to the documents in <code>topDocs</code>. 
+   *         corresponding to the documents in <code>docidsIn</code>. 
   *         If no highlights were found for a document, the
   *         first {@code maxPassages} from the field will
   *         be returned.
@ -289,6 +289,45 @@ public class PostingsHighlighter {
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
+    Map<String,String[]> snippets = new HashMap<String,String[]>();
+    for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
+      Object[] snippetObjects = ent.getValue();
+      String[] snippetStrings = new String[snippetObjects.length];
+      snippets.put(ent.getKey(), snippetStrings);
+      for(int i=0;i<snippetObjects.length;i++) {
+        Object snippet = snippetObjects[i];
+        if (snippet != null) {
+          snippetStrings[i] = snippet.toString();
+        }
+      }
+    }
+
+    return snippets;
+  }
+
+  /**
+   * Expert: highlights the top-N passages from multiple fields,
+   * for the provided int[] docids, to custom Object as
+   * returned by the {@link PassageFormatter}.  Use
+   * this API to render to something other than String.
+   * 
+   * @param fieldsIn field names to highlight. 
+   *        Must have a stored string value and also be indexed with offsets.
+   * @param query query to highlight.
+   * @param searcher searcher that was previously used to execute the query.
+   * @param docidsIn containing the document IDs to highlight.
+   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to 
+   *        form the highlighted snippets.
+   * @return Map keyed on field name, containing the array of formatted snippets 
+   *         corresponding to the documents in <code>docidsIn</code>. 
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} from the field will
+   *         be returned.
+   * @throws IOException if an I/O error occurred during processing
+   * @throws IllegalArgumentException if <code>field</code> was indexed without 
+   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+   */
+  public Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
    if (fieldsIn.length < 1) {
      throw new IllegalArgumentException("fieldsIn must not be empty");
    }
@ -335,7 +374,7 @@ public class PostingsHighlighter {
    // pull stored data:
    String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
    
-    Map<String,String[]> highlights = new HashMap<String,String[]>();
+    Map<String,Object[]> highlights = new HashMap<String,Object[]>();
    for (int i = 0; i < fields.length; i++) {
      String field = fields[i];
      int numPassages = maxPassages[i];
@ -350,9 +389,9 @@ public class PostingsHighlighter {
      for(Term term : fieldTerms) {
        terms[termUpto++] = term.bytes();
      }
-      Map<Integer,String> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
+      Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
        
-      String[] result = new String[docids.length];
+      Object[] result = new Object[docids.length];
      for (int j = 0; j < docidsIn.length; j++) {
        result[j] = fieldHighlights.get(docidsIn[j]);
      }
@ -394,8 +433,8 @@ public class PostingsHighlighter {
    return ' ';
  }
    
-  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {  
-    Map<Integer,String> highlights = new HashMap<Integer,String>();
+  private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {  
+    Map<Integer,Object> highlights = new HashMap<Integer,Object>();
    
    // reuse in the real sense... for docs in same segment we just advance our old enum
    DocsAndPositionsEnum postings[] = null;
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@ -21,6 +21,7 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.text.BreakIterator;
+import java.util.Arrays;
 import java.util.Map;

 import org.apache.lucene.analysis.Analyzer;
@ -47,8 +48,8 @@ import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase;

@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
 public class TestPostingsHighlighter extends LuceneTestCase {
@ -1068,4 +1069,54 @@ public class TestPostingsHighlighter extends LuceneTestCase {
    ir.close();
    dir.close();
  }
+
+  // LUCENE-4906
+  public void testObjectFormatter() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      protected PassageFormatter getFormatter(String field) {
+        return new PassageFormatter() {
+          PassageFormatter defaultFormatter = new DefaultPassageFormatter();
+
+          @Override
+          public String[] format(Passage passages[], String content) {
+            // Just turns the String snippet into a length 2
+            // array of String
+            return new String[] {"blah blah", defaultFormatter.format(passages, content).toString()};
+          }
+        };
+      }
+    };
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    int[] docIDs = new int[1];
+    docIDs[0] = topDocs.scoreDocs[0].doc;
+    Map<String,Object[]> snippets = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docIDs, new int[] {1});
+    Object[] bodySnippets = snippets.get("body");
+    assertEquals(1, bodySnippets.length);
+    assertTrue(Arrays.equals(new String[] {"blah blah", "Just a test <b>highlighting</b> from postings. "}, (String[]) bodySnippets[0]));
+    
+    ir.close();
+    dir.close();
+  }
 }