diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fafce11d0e1..8fb3a0572d5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,6 +110,9 @@ New Features
takes int[] docIDs instead of TopDocs. (Robert Muir, Mike
McCandless)
+* LUCENE-4856: If there are no matches for a given field, return the
+ first maxPassages sentences (Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
index 45f3fa33179..b06d943ea5d 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search.postingshighlight;
import java.io.IOException;
import java.text.BreakIterator;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
@@ -32,6 +33,7 @@ import java.util.TreeSet;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
@@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
@@ -142,7 +143,7 @@ public class PostingsHighlighter {
this.scorer = scorer;
this.formatter = formatter;
}
-
+
/**
* Highlights the top passages from a single field.
*
@@ -152,7 +153,8 @@ public class PostingsHighlighter {
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Array of formatted snippets corresponding to the documents in topDocs
.
- * If no highlights were found for a document, its value is null
.
+ * If no highlights were found for a document, the
+ * first sentence for the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if field
was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -172,7 +174,9 @@ public class PostingsHighlighter {
* @param maxPassages The maximum number of top-N ranked passages used to
* form the highlighted snippets.
* @return Array of formatted snippets corresponding to the documents in topDocs
.
- * If no highlights were found for a document, its value is null
.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if field
was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -201,7 +205,8 @@ public class PostingsHighlighter {
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in topDocs
.
- * If no highlights were found for a document, its value is null
.
+ * If no highlights were found for a document, the
+ * first sentence from the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if field
was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -231,7 +236,9 @@ public class PostingsHighlighter {
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in topDocs
.
- * If no highlights were found for a document, its value is null
.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if field
was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -259,7 +266,9 @@ public class PostingsHighlighter {
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in topDocs
.
- * If no highlights were found for a document, its value is null
.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if field
was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -327,7 +336,7 @@ public class PostingsHighlighter {
DocsAndPositionsEnum postings[] = null;
TermsEnum termsEnum = null;
int lastLeaf = -1;
-
+
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
if (content.length() == 0) {
@@ -347,8 +356,12 @@ public class PostingsHighlighter {
postings = new DocsAndPositionsEnum[terms.length];
}
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ if (passages.length == 0) {
+ passages = getEmptyHighlight(field, bi, maxPassages);
+ }
if (passages.length > 0) {
- // otherwise a null snippet
+ // otherwise a null snippet (eg if field is missing
+ // entirely from the doc)
highlights.put(doc, formatter.format(passages, content));
}
lastLeaf = leaf;
@@ -476,7 +489,35 @@ public class PostingsHighlighter {
}
current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
}
- return new Passage[0];
+
+ // Dead code but compiler disagrees:
+ assert false;
+ return null;
+ }
+
+ /** Called to summarize a document when no hits were
+ * found. By default this just returns the first
+ * {@code maxPassages} sentences; subclasses can override
+ * to customize. */
+ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+ // BreakIterator should be un-next'd:
+ List<Passage> passages = new ArrayList<Passage>();
+ int pos = bi.current();
+ assert pos == 0;
+ while (passages.size() < maxPassages) {
+ int next = bi.next();
+ if (next == BreakIterator.DONE) {
+ break;
+ }
+ Passage passage = new Passage();
+ passage.score = Float.NaN;
+ passage.startOffset = pos;
+ passage.endOffset = next;
+ passages.add(passage);
+ pos = next;
+ }
+
+ return passages.toArray(new Passage[passages.size()]);
}
private static class OffsetsEnum implements Comparable {
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
index 8290f03590e..a638ac72bfc 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.search.postingshighlight;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.text.BreakIterator;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
@@ -373,7 +374,6 @@ public class TestPostingsHighlighter extends LuceneTestCase {
assertEquals(1, snippets.length);
assertTrue(snippets[0].contains("Square"));
assertTrue(snippets[0].contains("Porter"));
- //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
ir.close();
dir.close();
}
@@ -547,4 +547,205 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close();
dir.close();
}
+
+ /** Make sure highlighter returns first N sentences if
+ * there were no hits. */
+ public void testEmptyHighlights() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Document doc = new Document();
+
+ Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+ doc.add(body);
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter();
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ int[] docIDs = new int[] {0};
+ String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+ assertEquals(1, snippets.length);
+ assertEquals("test this is. another sentence this test has. ", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Make sure we can customize how an empty
+ * highlight is returned. */
+ public void testCustomEmptyHighlights() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Document doc = new Document();
+
+ Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+ doc.add(body);
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+ return new Passage[0];
+ }
+ };
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ int[] docIDs = new int[] {0};
+ String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+ assertEquals(1, snippets.length);
+ assertNull(snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Make sure highlighter returns whole text when there
+ * are no hits and BreakIterator is null. */
+ public void testEmptyHighlightsWhole() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Document doc = new Document();
+
+ Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+ doc.add(body);
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ int[] docIDs = new int[] {0};
+ String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+ assertEquals(1, snippets.length);
+ assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Make sure highlighter is OK with entirely missing
+ * field. */
+ public void testFieldIsMissing() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Document doc = new Document();
+
+ Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+ doc.add(body);
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter();
+ Query query = new TermQuery(new Term("bogus", "highlighting"));
+ int[] docIDs = new int[] {0};
+ String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
+ assertEquals(1, snippets.length);
+ assertNull(snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+
+ public void testFieldIsJustSpace() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+ Document doc = new Document();
+ doc.add(new Field("body", " ", offsetsType));
+ doc.add(new Field("id", "id", offsetsType));
+ iw.addDocument(doc);
+
+ doc = new Document();
+ doc.add(new Field("body", "something", offsetsType));
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter();
+ int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ int[] docIDs = new int[1];
+ docIDs[0] = docID;
+ String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+ assertEquals(1, snippets.length);
+ assertEquals(" ", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+
+ public void testFieldIsEmptyString() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+ Document doc = new Document();
+ doc.add(new Field("body", "", offsetsType));
+ doc.add(new Field("id", "id", offsetsType));
+ iw.addDocument(doc);
+
+ doc = new Document();
+ doc.add(new Field("body", "something", offsetsType));
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter();
+ int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ int[] docIDs = new int[1];
+ docIDs[0] = docID;
+ String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+ assertEquals(1, snippets.length);
+ assertNull(snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
}