LUCENE-4856: If there are no matches for a given field, return the first maxPassages sentences

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458865 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-03-20 15:02:05 +00:00
parent e33b01bc50
commit 0e7f97b261
3 changed files with 256 additions and 11 deletions

View File

@ -110,6 +110,9 @@ New Features
takes int[] docIDs instead of TopDocs. (Robert Muir, Mike takes int[] docIDs instead of TopDocs. (Robert Muir, Mike
McCandless) McCandless)
* LUCENE-4856: If there are no matches for a given field, return the
first maxPassages sentences (Robert Muir, Mike McCandless)
API Changes API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use * LUCENE-4844: removed TaxonomyReader.getParent(), you should use

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search.postingshighlight;
import java.io.IOException; import java.io.IOException;
import java.text.BreakIterator; import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
@ -32,6 +33,7 @@ import java.util.TreeSet;
import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexReaderContext;
@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
@ -152,7 +153,8 @@ public class PostingsHighlighter {
* @param searcher searcher that was previously used to execute the query. * @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight. * @param topDocs TopDocs containing the summary result documents to highlight.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>. * If no highlights were found for a document, the
* first sentence for the field will be returned.
* @throws IOException if an I/O error occurred during processing * @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without * @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -172,7 +174,9 @@ public class PostingsHighlighter {
* @param maxPassages The maximum number of top-N ranked passages used to * @param maxPassages The maximum number of top-N ranked passages used to
* form the highlighted snippets. * form the highlighted snippets.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>. * If no highlights were found for a document, the
* first {@code maxPassages} sentences from the
* field will be returned.
* @throws IOException if an I/O error occurred during processing * @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without * @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -201,7 +205,8 @@ public class PostingsHighlighter {
* @param topDocs TopDocs containing the summary result documents to highlight. * @param topDocs TopDocs containing the summary result documents to highlight.
* @return Map keyed on field name, containing the array of formatted snippets * @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>. * corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>. * If no highlights were found for a document, the
* first sentence from the field will be returned.
* @throws IOException if an I/O error occurred during processing * @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without * @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -231,7 +236,9 @@ public class PostingsHighlighter {
* form the highlighted snippets. * form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets * @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>. * corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>. * If no highlights were found for a document, the
* first {@code maxPassages} sentences from the
* field will be returned.
* @throws IOException if an I/O error occurred during processing * @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without * @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -259,7 +266,9 @@ public class PostingsHighlighter {
* form the highlighted snippets. * form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets * @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>. * corresponding to the documents in <code>topDocs</code>.
* If no highlights were found for a document, its value is <code>null</code>. * If no highlights were found for a document, the
* first {@code maxPassages} from the field will
* be returned.
* @throws IOException if an I/O error occurred during processing * @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without * @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@ -347,8 +356,12 @@ public class PostingsHighlighter {
postings = new DocsAndPositionsEnum[terms.length]; postings = new DocsAndPositionsEnum[terms.length];
} }
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length == 0) {
passages = getEmptyHighlight(field, bi, maxPassages);
}
if (passages.length > 0) { if (passages.length > 0) {
// otherwise a null snippet // otherwise a null snippet (eg if field is missing
// entirely from the doc)
highlights.put(doc, formatter.format(passages, content)); highlights.put(doc, formatter.format(passages, content));
} }
lastLeaf = leaf; lastLeaf = leaf;
@ -476,7 +489,35 @@ public class PostingsHighlighter {
} }
current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
} }
return new Passage[0];
// Dead code but compiler disagrees:
assert false;
return null;
}
/** Called to summarize a document when no hits were
* found. By default this just returns the first
* {@code maxPassages} sentences; subclasses can override
* to customize. */
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
// BreakIterator should be un-next'd:
List<Passage> passages = new ArrayList<Passage>();
int pos = bi.current();
assert pos == 0;
while (passages.size() < maxPassages) {
int next = bi.next();
if (next == BreakIterator.DONE) {
break;
}
Passage passage = new Passage();
passage.score = Float.NaN;
passage.startOffset = pos;
passage.endOffset = next;
passages.add(passage);
pos = next;
}
return passages.toArray(new Passage[passages.size()]);
} }
private static class OffsetsEnum implements Comparable<OffsetsEnum> { private static class OffsetsEnum implements Comparable<OffsetsEnum> {

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.postingshighlight;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.text.BreakIterator;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
@ -373,7 +374,6 @@ public class TestPostingsHighlighter extends LuceneTestCase {
assertEquals(1, snippets.length); assertEquals(1, snippets.length);
assertTrue(snippets[0].contains("<b>Square</b>")); assertTrue(snippets[0].contains("<b>Square</b>"));
assertTrue(snippets[0].contains("<b>Porter</b>")); assertTrue(snippets[0].contains("<b>Porter</b>"));
//System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
ir.close(); ir.close();
dir.close(); dir.close();
} }
@ -547,4 +547,205 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close(); ir.close();
dir.close(); dir.close();
} }
/** Make sure highlighter returns first N sentences if
* there were no hits. */
public void testEmptyHighlights() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
doc.add(body);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
assertEquals(1, snippets.length);
assertEquals("test this is. another sentence this test has. ", snippets[0]);
ir.close();
dir.close();
}
/** Make sure highlighter we can customize how emtpy
* highlight is returned. */
public void testCustomEmptyHighlights() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
doc.add(body);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter() {
@Override
public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
return new Passage[0];
}
};
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
assertEquals(1, snippets.length);
assertNull(snippets[0]);
ir.close();
dir.close();
}
/** Make sure highlighter returns whole text when there
* are no hits and BreakIterator is null. */
public void testEmptyHighlightsWhole() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
doc.add(body);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
assertEquals(1, snippets.length);
assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
ir.close();
dir.close();
}
/** Make sure highlighter is OK with entirely missing
* field. */
public void testFieldIsMissing() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
doc.add(body);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("bogus", "highlighting"));
int[] docIDs = new int[] {0};
String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
assertEquals(1, snippets.length);
assertNull(snippets[0]);
ir.close();
dir.close();
}
public void testFieldIsJustSpace() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
doc.add(new Field("body", " ", offsetsType));
doc.add(new Field("id", "id", offsetsType));
iw.addDocument(doc);
doc = new Document();
doc.add(new Field("body", "something", offsetsType));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[1];
docIDs[0] = docID;
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
assertEquals(1, snippets.length);
assertEquals(" ", snippets[0]);
ir.close();
dir.close();
}
public void testFieldIsEmptyString() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Document doc = new Document();
doc.add(new Field("body", "", offsetsType));
doc.add(new Field("id", "id", offsetsType));
iw.addDocument(doc);
doc = new Document();
doc.add(new Field("body", "something", offsetsType));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[1];
docIDs[0] = docID;
String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
assertEquals(1, snippets.length);
assertNull(snippets[0]);
ir.close();
dir.close();
}
} }