LUCENE-4826: fix PostingsHighlighter PassageQueue comparator so we keep the best 2 passages

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455693 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-03-12 20:09:36 +00:00
parent 092545884e
commit 7cc57853d6
4 changed files with 77 additions and 5 deletions

lucene/CHANGES.txt

@@ -71,6 +71,9 @@ Bug Fixes
 * LUCENE-4819: seekExact(BytesRef, boolean) did not work correctly with
   Sorted[Set]DocValuesTermsEnum. (Robert Muir)
 
+* LUCENE-4826: PostingsHighlighter was not returning the top N best
+  scoring passages. (Robert Muir, Mike McCandless)
+
 ======================= Lucene 4.2.0 =======================
 
 Changes in backwards compatibility policy

lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java

@@ -62,7 +62,7 @@ import org.apache.lucene.util.UnicodeUtil;
  * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
  * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
  * <p>
- * <b>WARNING</b>: The code is very new and may still have some exciting bugs!
+ * <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
  * <p>
  * Example usage:
  * <pre class="prettyprint">
@@ -361,10 +361,12 @@ public final class PostingsHighlighter {
     PriorityQueue<Passage> passageQueue = new PriorityQueue<Passage>(n, new Comparator<Passage>() {
       @Override
       public int compare(Passage left, Passage right) {
-        if (right.score == left.score) {
-          return right.startOffset - left.endOffset;
+        if (left.score < right.score) {
+          return -1;
+        } else if (left.score > right.score) {
+          return 1;
         } else {
-          return right.score > left.score ? 1 : -1;
+          return left.startOffset - right.startOffset;
         }
       }
     });
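
Why the comparator direction matters: java.util.PriorityQueue is a min-heap, so the passage the comparator ranks lowest sits at the head and is the one replaced when a better passage arrives. The comparator therefore has to order passages from worst to best. The old version sorted by descending score (and compared right.startOffset against left.endOffset on ties), so the head of the queue was the current best passage and that is what got evicted. Below is a standalone, hedged sketch of the bounded-queue pattern involved; Passage here is a simplified stand-in, not the Lucene class.

import java.util.Comparator;
import java.util.PriorityQueue;

// Standalone sketch of the "keep the N best passages" pattern: a bounded
// min-heap whose head is always the worst passage kept so far.
public class BestPassagesSketch {

  // Simplified stand-in, not org.apache.lucene.search.postingshighlight.Passage.
  static final class Passage {
    final float score;
    final int startOffset;
    Passage(float score, int startOffset) {
      this.score = score;
      this.startOffset = startOffset;
    }
  }

  public static void main(String[] args) {
    final int n = 2;
    // Worst passage first: lower score sorts first; ties break on start offset.
    PriorityQueue<Passage> queue = new PriorityQueue<Passage>(n, new Comparator<Passage>() {
      @Override
      public int compare(Passage left, Passage right) {
        if (left.score < right.score) {
          return -1;
        } else if (left.score > right.score) {
          return 1;
        } else {
          return left.startOffset - right.startOffset;
        }
      }
    });

    float[] scores = {1.0f, 3.0f, 2.0f, 5.0f};
    for (int i = 0; i < scores.length; i++) {
      Passage p = new Passage(scores[i], i * 100);
      if (queue.size() < n) {
        queue.offer(p);
      } else if (queue.peek().score < p.score) {
        queue.poll();   // evict the worst of the passages kept so far
        queue.offer(p);
      }
    }

    // Polls 3.0 then 5.0: the two best-scoring passages survive.
    while (!queue.isEmpty()) {
      System.out.println(queue.poll().score);
    }
  }
}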

CambridgeMA.utf8 (new test resource): file diff suppressed because one or more lines are too long

lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java

@@ -17,6 +17,8 @@ package org.apache.lucene.search.postingshighlight;
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -41,8 +43,8 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
public class TestPostingsHighlighter extends LuceneTestCase {
@@ -340,4 +342,68 @@ public class TestPostingsHighlighter extends LuceneTestCase {
ir.close();
dir.close();
}
public void testCambridgeMA() throws Exception {
BufferedReader r = new BufferedReader(new InputStreamReader(
this.getClass().getResourceAsStream("CambridgeMA.utf8"), "UTF-8"));
String text = r.readLine();
r.close();
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer);
FieldType positionsType = new FieldType(TextField.TYPE_STORED);
positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", text, positionsType);
Document document = new Document();
document.add(body);
iw.addDocument(document);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("body", "porter")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "square")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term("body", "massachusetts")), BooleanClause.Occur.SHOULD);
TopDocs topDocs = searcher.search(query, 10);
assertEquals(1, topDocs.totalHits);
PostingsHighlighter highlighter = new PostingsHighlighter(Integer.MAX_VALUE-1);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
assertEquals(1, snippets.length);
assertTrue(snippets[0].contains("<b>Square</b>"));
assertTrue(snippets[0].contains("<b>Porter</b>"));
//System.out.println("GOT: " + snippets.length + "; " + Arrays.toString(snippets));
ir.close();
dir.close();
}
public void testPassageRanking() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighter();
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
assertEquals(1, snippets.length);
assertEquals("This is a <b>test</b>. ... Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]);
ir.close();
dir.close();
}
}
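
Taken together, the tests show the end-to-end flow: index the field with offsets in the postings, run the query, then hand the TopDocs to the highlighter. A minimal, hedged usage sketch against the 4.2-era API follows; the wrapper class and method name are illustrative, and it assumes an already-open IndexSearcher over an index whose "body" field was indexed with IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, as in the tests above.

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

// Hedged usage sketch, not part of the commit: highlight the top hits for a
// term query, asking for up to 2 passages per document (as the tests do).
class PostingsHighlighterUsage {
  static String[] snippetsFor(IndexSearcher searcher) throws IOException {
    Query query = new TermQuery(new Term("body", "test"));
    TopDocs topDocs = searcher.search(query, 10);
    PostingsHighlighter highlighter = new PostingsHighlighter();
    // One formatted snippet per hit in topDocs; up to 2 passages per document.
    return highlighter.highlight("body", query, searcher, topDocs, 2);
  }
}

The returned array has one entry per hit; when a document contributes more than one scored passage, the default formatter concatenates them into a single snippet, as the expected string in testPassageRanking shows.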