LUCENE-4856: If there are no matches for a given field, return the first maxPassages sentences

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458865 13f79535-47bb-0310-9956-ffa450edef68
2013-03-20 15:02:05 +00:00 · 2013-03-20 15:02:05 +00:00 · 0e7f97b261
parent e33b01bc50
commit 0e7f97b261
3 changed files with 256 additions and 11 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -110,6 +110,9 @@ New Features
  takes int[] docIDs instead of TopDocs.  (Robert Muir, Mike
  McCandless)

+* LUCENE-4856: If there are no matches for a given field, return the
+  first maxPassages sentences (Robert Muir, Mike McCandless)
+
 API Changes

 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@ -19,6 +19,7 @@ package org.apache.lucene.search.postingshighlight;

 import java.io.IOException;
 import java.text.BreakIterator;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
@ -32,6 +33,7 @@ import java.util.TreeSet;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
@ -142,7 +143,7 @@ public class PostingsHighlighter {
    this.scorer = scorer;
    this.formatter = formatter;
  }
-  
+
  /**
   * Highlights the top passages from a single field.
   * 
@ -152,7 +153,8 @@ public class PostingsHighlighter {
   * @param searcher searcher that was previously used to execute the query.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. 
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first sentence for the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without 
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -172,7 +174,9 @@ public class PostingsHighlighter {
   * @param maxPassages The maximum number of top-N ranked passages used to 
   *        form the highlighted snippets.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. 
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without 
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -201,7 +205,8 @@ public class PostingsHighlighter {
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Map keyed on field name, containing the array of formatted snippets 
   *         corresponding to the documents in <code>topDocs</code>. 
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first sentence from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without 
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -231,7 +236,9 @@ public class PostingsHighlighter {
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets 
   *         corresponding to the documents in <code>topDocs</code>. 
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without 
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -259,7 +266,9 @@ public class PostingsHighlighter {
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets 
   *         corresponding to the documents in <code>topDocs</code>. 
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} from the field will
+   *         be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without 
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -327,7 +336,7 @@ public class PostingsHighlighter {
    DocsAndPositionsEnum postings[] = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;
-    
+
    for (int i = 0; i < docids.length; i++) {
      String content = contents[i];
      if (content.length() == 0) {
@ -347,8 +356,12 @@ public class PostingsHighlighter {
        postings = new DocsAndPositionsEnum[terms.length];
      }
      Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+      if (passages.length == 0) {
+        passages = getEmptyHighlight(field, bi, maxPassages);
+      }
      if (passages.length > 0) {
-        // otherwise a null snippet
+        // otherwise a null snippet (eg if field is missing
+        // entirely from the doc)
        highlights.put(doc, formatter.format(passages, content));
      }
      lastLeaf = leaf;
@ -476,7 +489,35 @@ public class PostingsHighlighter {
      }
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
    }
-    return new Passage[0];
+
+    // Dead code but compiler disagrees:
+    assert false;
+    return null;
+  }
+
+  /** Called to summarize a document when no hits were
+   *  found.  By default this just returns the first
+   *  {@code maxPassages} sentences; subclasses can override
+   *  to customize. */
+  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+    // BreakIterator should be un-next'd:
+    List<Passage> passages = new ArrayList<Passage>();
+    int pos = bi.current();
+    assert pos == 0;
+    while (passages.size() < maxPassages) {
+      int next = bi.next();
+      if (next == BreakIterator.DONE) {
+        break;
+      }
+      Passage passage = new Passage();
+      passage.score = Float.NaN;
+      passage.startOffset = pos;
+      passage.endOffset = next;
+      passages.add(passage);
+      pos = next;
+    }
+
+    return passages.toArray(new Passage[passages.size()]);
  }
  
  private static class OffsetsEnum implements Comparable<OffsetsEnum> {
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@ -20,6 +20,7 @@ package org.apache.lucene.search.postingshighlight;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.text.BreakIterator;
 import java.util.Map;

 import org.apache.lucene.analysis.Analyzer;
@ -373,7 +374,6 @@ public class TestPostingsHighlighter extends LuceneTestCase {
    assertEquals(1, snippets.length);
    assertTrue(snippets[0].contains("<b>Square</b>"));
    assertTrue(snippets[0].contains("<b>Porter</b>"));
-    //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
    ir.close();
    dir.close();
  }
@ -547,4 +547,205 @@ public class TestPostingsHighlighter extends LuceneTestCase {
    ir.close();
    dir.close();
  }
+
+  /** Make sure highlighter returns first N sentences if
+   *  there were no hits. */
+  public void testEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is.  another sentence this test has.  far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is.  another sentence this test has.  ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter we can customize how emtpy
+   *  highlight is returned. */
+  public void testCustomEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is.  another sentence this test has.  far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+        @Override
+        public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+          return new Passage[0];
+        }
+      };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter returns whole text when there
+   *  are no hits and BreakIterator is null. */
+  public void testEmptyHighlightsWhole() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is.  another sentence this test has.  far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is.  another sentence this test has.  far away is that planet.", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter is OK with entirely missing
+   *  field. */
+  public void testFieldIsMissing() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is.  another sentence this test has.  far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("bogus", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsJustSpace() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", "   ", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("   ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsEmptyString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", "", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
 }