diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fafce11d0e1..8fb3a0572d5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -110,6 +110,9 @@ New Features takes int[] docIDs instead of TopDocs. (Robert Muir, Mike McCandless) +* LUCENE-4856: If there are no matches for a given field, return the + first maxPassages sentences (Robert Muir, Mike McCandless) + API Changes * LUCENE-4844: removed TaxonomyReader.getParent(), you should use diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java index 45f3fa33179..b06d943ea5d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java @@ -19,6 +19,7 @@ package org.apache.lucene.search.postingshighlight; import java.io.IOException; import java.text.BreakIterator; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; @@ -32,6 +33,7 @@ import java.util.TreeSet; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; @@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -142,7 +143,7 @@ public class PostingsHighlighter { this.scorer = scorer; this.formatter = formatter; } - + /** * Highlights the top passages from a single field. * @@ -152,7 +153,8 @@ public class PostingsHighlighter { * @param searcher searcher that was previously used to execute the query. * @param topDocs TopDocs containing the summary result documents to highlight. * @return Array of formatted snippets corresponding to the documents in topDocs. - * If no highlights were found for a document, its value is null. + * If no highlights were found for a document, the + * first sentence for the field will be returned. * @throws IOException if an I/O error occurred during processing * @throws IllegalArgumentException if field was indexed without * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} @@ -172,7 +174,9 @@ public class PostingsHighlighter { * @param maxPassages The maximum number of top-N ranked passages used to * form the highlighted snippets. * @return Array of formatted snippets corresponding to the documents in topDocs. - * If no highlights were found for a document, its value is null. + * If no highlights were found for a document, the + * first {@code maxPassages} sentences from the + * field will be returned. * @throws IOException if an I/O error occurred during processing * @throws IllegalArgumentException if field was indexed without * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} @@ -201,7 +205,8 @@ public class PostingsHighlighter { * @param topDocs TopDocs containing the summary result documents to highlight. * @return Map keyed on field name, containing the array of formatted snippets * corresponding to the documents in topDocs. - * If no highlights were found for a document, its value is null. + * If no highlights were found for a document, the + * first sentence from the field will be returned. * @throws IOException if an I/O error occurred during processing * @throws IllegalArgumentException if field was indexed without * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} @@ -231,7 +236,9 @@ public class PostingsHighlighter { * form the highlighted snippets. * @return Map keyed on field name, containing the array of formatted snippets * corresponding to the documents in topDocs. - * If no highlights were found for a document, its value is null. + * If no highlights were found for a document, the + * first {@code maxPassages} sentences from the + * field will be returned. * @throws IOException if an I/O error occurred during processing * @throws IllegalArgumentException if field was indexed without * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} @@ -259,7 +266,9 @@ public class PostingsHighlighter { * form the highlighted snippets. * @return Map keyed on field name, containing the array of formatted snippets * corresponding to the documents in topDocs. - * If no highlights were found for a document, its value is null. + * If no highlights were found for a document, the + * first {@code maxPassages} from the field will + * be returned. * @throws IOException if an I/O error occurred during processing * @throws IllegalArgumentException if field was indexed without * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} @@ -327,7 +336,7 @@ public class PostingsHighlighter { DocsAndPositionsEnum postings[] = null; TermsEnum termsEnum = null; int lastLeaf = -1; - + for (int i = 0; i < docids.length; i++) { String content = contents[i]; if (content.length() == 0) { @@ -347,8 +356,12 @@ public class PostingsHighlighter { postings = new DocsAndPositionsEnum[terms.length]; } Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); + if (passages.length == 0) { + passages = getEmptyHighlight(field, bi, maxPassages); + } if (passages.length > 0) { - // otherwise a null snippet + // otherwise a null snippet (eg if field is missing + // entirely from the doc) highlights.put(doc, formatter.format(passages, content)); } lastLeaf = leaf; @@ -476,7 +489,35 @@ public class PostingsHighlighter { } current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); } - return new Passage[0]; + + // Dead code but compiler disagrees: + assert false; + return null; + } + + /** Called to summarize a document when no hits were + * found. By default this just returns the first + * {@code maxPassages} sentences; subclasses can override + * to customize. */ + protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { + // BreakIterator should be un-next'd: + List passages = new ArrayList(); + int pos = bi.current(); + assert pos == 0; + while (passages.size() < maxPassages) { + int next = bi.next(); + if (next == BreakIterator.DONE) { + break; + } + Passage passage = new Passage(); + passage.score = Float.NaN; + passage.startOffset = pos; + passage.endOffset = next; + passages.add(passage); + pos = next; + } + + return passages.toArray(new Passage[passages.size()]); } private static class OffsetsEnum implements Comparable { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java index 8290f03590e..a638ac72bfc 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.postingshighlight; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.text.BreakIterator; import java.util.Map; import org.apache.lucene.analysis.Analyzer; @@ -373,7 +374,6 @@ public class TestPostingsHighlighter extends LuceneTestCase { assertEquals(1, snippets.length); assertTrue(snippets[0].contains("Square")); assertTrue(snippets[0].contains("Porter")); - //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets)); ir.close(); dir.close(); } @@ -547,4 +547,205 @@ public class TestPostingsHighlighter extends LuceneTestCase { ir.close(); dir.close(); } + + /** Make sure highlighter returns first N sentences if + * there were no hits. */ + public void testEmptyHighlights() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Document doc = new Document(); + + Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); + doc.add(body); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(); + Query query = new TermQuery(new Term("body", "highlighting")); + int[] docIDs = new int[] {0}; + String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body"); + assertEquals(1, snippets.length); + assertEquals("test this is. another sentence this test has. ", snippets[0]); + + ir.close(); + dir.close(); + } + + /** Make sure highlighter we can customize how emtpy + * highlight is returned. */ + public void testCustomEmptyHighlights() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Document doc = new Document(); + + Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); + doc.add(body); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter() { + @Override + public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { + return new Passage[0]; + } + }; + Query query = new TermQuery(new Term("body", "highlighting")); + int[] docIDs = new int[] {0}; + String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body"); + assertEquals(1, snippets.length); + assertNull(snippets[0]); + + ir.close(); + dir.close(); + } + + /** Make sure highlighter returns whole text when there + * are no hits and BreakIterator is null. */ + public void testEmptyHighlightsWhole() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Document doc = new Document(); + + Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); + doc.add(body); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter()); + Query query = new TermQuery(new Term("body", "highlighting")); + int[] docIDs = new int[] {0}; + String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body"); + assertEquals(1, snippets.length); + assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]); + + ir.close(); + dir.close(); + } + + /** Make sure highlighter is OK with entirely missing + * field. */ + public void testFieldIsMissing() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Document doc = new Document(); + + Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); + doc.add(body); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(); + Query query = new TermQuery(new Term("bogus", "highlighting")); + int[] docIDs = new int[] {0}; + String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus"); + assertEquals(1, snippets.length); + assertNull(snippets[0]); + + ir.close(); + dir.close(); + } + + public void testFieldIsJustSpace() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + + Document doc = new Document(); + doc.add(new Field("body", " ", offsetsType)); + doc.add(new Field("id", "id", offsetsType)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("body", "something", offsetsType)); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(); + int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; + + Query query = new TermQuery(new Term("body", "highlighting")); + int[] docIDs = new int[1]; + docIDs[0] = docID; + String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body"); + assertEquals(1, snippets.length); + assertEquals(" ", snippets[0]); + + ir.close(); + dir.close(); + } + + public void testFieldIsEmptyString() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + + Document doc = new Document(); + doc.add(new Field("body", "", offsetsType)); + doc.add(new Field("id", "id", offsetsType)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("body", "something", offsetsType)); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(); + int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; + + Query query = new TermQuery(new Term("body", "highlighting")); + int[] docIDs = new int[1]; + docIDs[0] = docID; + String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body"); + assertEquals(1, snippets.length); + assertNull(snippets[0]); + + ir.close(); + dir.close(); + } }