From 53fd6a8919ba84ba27f140b5532af61f82f744db Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 14 Mar 2013 18:25:18 +0000 Subject: [PATCH] LUCENE-4827: expose bm25 parameters in PassageScorer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456597 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../postingshighlight/PassageScorer.java | 32 ++++++-- .../TestPostingsHighlighterRanking.java | 77 +++++++++++++++++++ .../highlight/PostingsSolrHighlighter.java | 21 ++++- 4 files changed, 127 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f268e9b8667..71dcc9fa1fb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -73,6 +73,9 @@ New Features now highlights the entire content as a single Passage. (Robert Muir, Mike McCandless) +* LUCENE-4816: Add additional ctor to PostingsHighlighter PassageScorer + to provide bm25 k1,b,avgdl parameters. (Robert Muir) + Optimizations * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java index 4d364502795..1e732f9d20e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java @@ -30,15 +30,37 @@ public class PassageScorer { // TODO: this formula is completely made up. It might not provide relevant snippets! /** BM25 k1 parameter, controls term frequency normalization */ - public static final float k1 = 1.2f; + final float k1; /** BM25 b parameter, controls length normalization. */ - public static final float b = 0.75f; + final float b; + /** A pivot used for length normalization. */ + final float pivot; /** - * A pivot used for length normalization. - * The default value is the typical average English sentence length. + * Creates PassageScorer with these default values: + * */ - public static final float pivot = 87f; + public PassageScorer() { + // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ? + // 87 is typical average english sentence length. + this(1.2f, 0.75f, 87f); + } + + /** + * Creates PassageScorer with specified scoring parameters + * @param k1 Controls non-linear term frequency normalization (saturation). + * @param b Controls to what degree passage length normalizes tf values. + * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters). + */ + public PassageScorer(float k1, float b, float pivot) { + this.k1 = k1; + this.b = b; + this.pivot = pivot; + } /** * Computes term importance, given its in-document statistics. diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java index f4cbb3e7a93..1caf2bcdd52 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java @@ -31,6 +31,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.Term; @@ -38,6 +39,7 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; @@ -237,4 +239,79 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase { return "Pair [start=" + start + ", end=" + end + "]"; } } + + /** sets b=0 to disable passage length normalization */ + public void testCustomB() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " + + "you have no idea how painful it was for me to type this long sentence into my IDE."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(10000, + BreakIterator.getSentenceInstance(Locale.ROOT), + new PassageScorer(1.2f, 0, 87), + new PassageFormatter()); + Query query = new TermQuery(new Term("body", "test")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1); + assertEquals(1, snippets.length); + assertTrue(snippets[0].startsWith("This test is a better test")); + + ir.close(); + dir.close(); + } + + /** sets k1=0 for simple coordinate-level match (# of query terms present) */ + public void testCustomK1() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This has only foo foo. " + + "On the other hand this sentence contains both foo and bar. " + + "This has only bar bar bar bar bar bar bar bar bar bar bar bar."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(10000, + BreakIterator.getSentenceInstance(Locale.ROOT), + new PassageScorer(0, 0.75f, 87), + new PassageFormatter()); + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1); + assertEquals(1, snippets.length); + assertTrue(snippets[0].startsWith("On the other hand")); + + ir.close(); + dir.close(); + } } diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java index 7f0d2ed159a..60b077f82a3 100644 --- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java @@ -55,6 +55,9 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; * preTag="&lt;em&gt;" * postTag="&lt;/em&gt;" * ellipsis="... " + * k1="1.2" + * b="0.75" + * pivot="87" * maxLength=10000/> * </searchComponent> * @@ -78,7 +81,23 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn public void init(PluginInfo info) { Map attributes = info.attributes; BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.ROOT); - PassageScorer scorer = new PassageScorer(); + + // scorer parameters: k1/b/pivot + String k1 = attributes.get("k1"); + if (k1 == null) { + k1 = "1.2"; + } + + String b = attributes.get("b"); + if (b == null) { + b = "0.75"; + } + + String pivot = attributes.get("pivot"); + if (pivot == null) { + pivot = "87"; + } + PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot)); // formatter parameters: preTag/postTag/ellipsis String preTag = attributes.get("preTag");