mirror of https://github.com/apache/lucene.git

LUCENE-4856: If there are no matches for a given field, return the first maxPassages sentences

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458865 13f79535-47bb-0310-9956-ffa450edef68

parent e33b01bc50
commit 0e7f97b261
lucene/CHANGES.txt
@@ -110,6 +110,9 @@ New Features
   takes int[] docIDs instead of TopDocs.  (Robert Muir, Mike
   McCandless)
 
+* LUCENE-4856: If there are no matches for a given field, return the
+  first maxPassages sentences (Robert Muir, Mike McCandless)
+
 API Changes
 
 * LUCENE-4844: removed TaxonomyReader.getParent(), you should use
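How the change behaves in practice, following the new tests added below (a sketch only: it assumes an IndexSearcher named searcher over an index whose stored "body" field contains no match for the query; nothing here is additional API beyond what the commit itself exercises):

PostingsHighlighter highlighter = new PostingsHighlighter();
// The query term matches nothing in the document...
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[] {0};
Map<String,String[]> highlights =
    highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2);
// ...yet the snippet is now the first maxPassages (2) sentences of the
// field, where before this commit it would have been null.
String snippet = highlights.get("body")[0];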
lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search.postingshighlight;
 
 import java.io.IOException;
 import java.text.BreakIterator;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -32,6 +33,7 @@ import java.util.TreeSet;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReaderContext;
@@ -41,7 +43,6 @@ import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
@@ -142,7 +143,7 @@ public class PostingsHighlighter {
     this.scorer = scorer;
     this.formatter = formatter;
   }
 
   /**
    * Highlights the top passages from a single field.
    *
@@ -152,7 +153,8 @@ public class PostingsHighlighter {
    * @param searcher searcher that was previously used to execute the query.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first sentence for the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -172,7 +174,9 @@ public class PostingsHighlighter {
    * @param maxPassages The maximum number of top-N ranked passages used to
    *        form the highlighted snippets.
    * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -201,7 +205,8 @@ public class PostingsHighlighter {
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in <code>topDocs</code>.
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first sentence from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -231,7 +236,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in <code>topDocs</code>.
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -259,7 +266,9 @@ public class PostingsHighlighter {
    *        form the highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets
    *         corresponding to the documents in <code>topDocs</code>.
-   *         If no highlights were found for a document, its value is <code>null</code>.
+   *         If no highlights were found for a document, the
+   *         first {@code maxPassages} sentences from the
+   *         field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without
    *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -327,7 +336,7 @@ public class PostingsHighlighter {
     DocsAndPositionsEnum postings[] = null;
     TermsEnum termsEnum = null;
     int lastLeaf = -1;
 
     for (int i = 0; i < docids.length; i++) {
       String content = contents[i];
       if (content.length() == 0) {
@@ -347,8 +356,12 @@ public class PostingsHighlighter {
           postings = new DocsAndPositionsEnum[terms.length];
         }
         Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+        if (passages.length == 0) {
+          passages = getEmptyHighlight(field, bi, maxPassages);
+        }
         if (passages.length > 0) {
-          // otherwise a null snippet
+          // otherwise a null snippet (eg if field is missing
+          // entirely from the doc)
           highlights.put(doc, formatter.format(passages, content));
         }
         lastLeaf = leaf;
@@ -476,7 +489,35 @@ public class PostingsHighlighter {
       }
       current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
     }
-    return new Passage[0];
+
+    // Dead code but compiler disagrees:
+    assert false;
+    return null;
   }
 
+  /** Called to summarize a document when no hits were
+   *  found.  By default this just returns the first
+   *  {@code maxPassages} sentences; subclasses can override
+   *  to customize. */
+  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+    // BreakIterator should be un-next'd:
+    List<Passage> passages = new ArrayList<Passage>();
+    int pos = bi.current();
+    assert pos == 0;
+    while (passages.size() < maxPassages) {
+      int next = bi.next();
+      if (next == BreakIterator.DONE) {
+        break;
+      }
+      Passage passage = new Passage();
+      passage.score = Float.NaN;
+      passage.startOffset = pos;
+      passage.endOffset = next;
+      passages.add(passage);
+      pos = next;
+    }
+
+    return passages.toArray(new Passage[passages.size()]);
+  }
+
   private static class OffsetsEnum implements Comparable<OffsetsEnum> {
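The fallback walks sentence boundaries with the caller-supplied BreakIterator (the default constructor uses a sentence instance), and marks the synthetic passages with a Float.NaN score since no terms actually matched. A standalone sketch of the same java.text.BreakIterator iteration pattern that getEmptyHighlight relies on, reusing the sample text from the new tests (the demo class itself is not part of the commit):

import java.text.BreakIterator;
import java.util.Locale;

public class SentenceBreakDemo {
  public static void main(String[] args) {
    String content = "test this is. another sentence this test has. far away is that planet.";
    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
    bi.setText(content);
    // A freshly-set ("un-next'd") iterator reports current() == 0, which is
    // why getEmptyHighlight can assert pos == 0 before its loop.
    int pos = bi.current();
    int next;
    while ((next = bi.next()) != BreakIterator.DONE) {
      // Each [pos, next) span is one sentence, trailing space included,
      // which is how testEmptyHighlights ends up expecting the snippet
      // "test this is. another sentence this test has. " for maxPassages = 2.
      System.out.println("[" + content.substring(pos, next) + "]");
      pos = next;
    }
  }
}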
lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.search.postingshighlight;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.text.BreakIterator;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -373,7 +374,6 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     assertEquals(1, snippets.length);
     assertTrue(snippets[0].contains("<b>Square</b>"));
     assertTrue(snippets[0].contains("<b>Porter</b>"));
-    //System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
     ir.close();
     dir.close();
   }
@@ -547,4 +547,205 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     ir.close();
     dir.close();
   }
+
+  /** Make sure highlighter returns first N sentences if
+   *  there were no hits. */
+  public void testEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure we can customize how an empty
+   *  highlight is returned. */
+  public void testCustomEmptyHighlights() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+        return new Passage[0];
+      }
+    };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter returns whole text when there
+   *  are no hits and BreakIterator is null. */
+  public void testEmptyHighlightsWhole() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, null, new PassageScorer(), new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Make sure highlighter is OK with entirely missing
+   *  field. */
+  public void testFieldIsMissing() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+
+    Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType);
+    doc.add(body);
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("bogus", "highlighting"));
+    int[] docIDs = new int[] {0};
+    String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, 2).get("bogus");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsJustSpace() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", " ", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertEquals(" ", snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
+
+  public void testFieldIsEmptyString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+    Document doc = new Document();
+    doc.add(new Field("body", "", offsetsType));
+    doc.add(new Field("id", "id", offsetsType));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("body", "something", offsetsType));
+    iw.addDocument(doc);
+
+    IndexReader ir = iw.getReader();
+    iw.close();
+
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
+
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    int[] docIDs = new int[1];
+    docIDs[0] = docID;
+    String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, 2).get("body");
+    assertEquals(1, snippets.length);
+    assertNull(snippets[0]);
+
+    ir.close();
+    dir.close();
+  }
 }
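Because getEmptyHighlight is protected, applications can also layer their own policy on top of the default rather than disabling the fallback outright as testCustomEmptyHighlights does. A minimal sketch (illustrative only, not part of the commit): cap the no-match fallback at a single sentence regardless of maxPassages:

// Illustrative subclass: fall back to at most one leading sentence,
// no matter how many passages were requested for matching docs.
PostingsHighlighter highlighter = new PostingsHighlighter() {
  @Override
  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
    return super.getEmptyHighlight(fieldName, bi, 1);
  }
};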