LUCENE-4827: expose bm25 parameters in PassageScorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456597 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-03-14 18:25:18 +00:00
parent 3943f0f8c4
commit 53fd6a8919
4 changed files with 127 additions and 6 deletions

@@ -73,6 +73,9 @@ New Features
   now highlights the entire content as a single Passage. (Robert
   Muir, Mike McCandless)
+* LUCENE-4816: Add additional ctor to PostingsHighlighter PassageScorer
+  to provide bm25 k1,b,avgdl parameters. (Robert Muir)
 Optimizations
 * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the

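Before the code diffs, a quick sketch of how the new constructor is used together with PostingsHighlighter; this mirrors the tests added by this commit (the four-argument PostingsHighlighter constructor and PassageFormatter shown there), with the default parameter values filled in:

    // Sketch based on the tests in this commit: tune BM25-style passage scoring for highlighting.
    PostingsHighlighter highlighter = new PostingsHighlighter(10000,   // maxLength
        BreakIterator.getSentenceInstance(Locale.ROOT),                // sentence-sized passages
        new PassageScorer(1.2f, 0.75f, 87f),                           // k1, b, pivot (avgdl)
        new PassageFormatter());
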
@@ -30,15 +30,37 @@ public class PassageScorer {
   // TODO: this formula is completely made up. It might not provide relevant snippets!
   /** BM25 k1 parameter, controls term frequency normalization */
-  public static final float k1 = 1.2f;
+  final float k1;
   /** BM25 b parameter, controls length normalization. */
-  public static final float b = 0.75f;
+  final float b;
+  /** A pivot used for length normalization. */
+  final float pivot;
   /**
-   * A pivot used for length normalization.
-   * The default value is the typical average English sentence length.
+   * Creates PassageScorer with these default values:
+   * <ul>
+   *   <li>{@code k1 = 1.2},
+   *   <li>{@code b = 0.75}.
+   *   <li>{@code pivot = 87}
+   * </ul>
    */
-  public static final float pivot = 87f;
+  public PassageScorer() {
+    // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
+    // 87 is typical average english sentence length.
+    this(1.2f, 0.75f, 87f);
+  }
+  /**
+   * Creates PassageScorer with specified scoring parameters
+   * @param k1 Controls non-linear term frequency normalization (saturation).
+   * @param b Controls to what degree passage length normalizes tf values.
+   * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
+   */
+  public PassageScorer(float k1, float b, float pivot) {
+    this.k1 = k1;
+    this.b = b;
+    this.pivot = pivot;
+  }
   /**
    * Computes term importance, given its in-document statistics.

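A note on the three parameters introduced above: k1 controls how quickly repeated occurrences of a term stop adding to the score (saturation), b controls how strongly the passage length is normalized against pivot, and pivot is a rough average passage length in characters. As a hedged illustration only, not the actual PassageScorer formula (the TODO above says that formula is made up), a conventional BM25-style tf term combines them like this; the class name is hypothetical:

    // Illustration of the conventional BM25 tf term; not the PassageScorer implementation.
    class Bm25TfSketch {
      final float k1;    // saturation: k1 = 0 reduces tf to a presence/absence test
      final float b;     // length normalization strength: b = 0 disables it
      final float pivot; // rough average passage length, in characters

      Bm25TfSketch(float k1, float b, float pivot) {
        this.k1 = k1;
        this.b = b;
        this.pivot = pivot;
      }

      /** Saturating, length-normalized term frequency. */
      float tf(float freq, float passageLength) {
        float norm = k1 * ((1 - b) + b * (passageLength / pivot));
        return freq / (freq + norm);
      }
    }

In this form, b = 0 removes the length term entirely and k1 = 0 turns each term into a simple matched/not-matched signal, which is exactly what the two new tests below (testCustomB and testCustomK1) rely on.
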
@@ -31,6 +31,7 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.Term;
@@ -38,6 +39,7 @@ import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
@@ -237,4 +239,79 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
       return "Pair [start=" + start + ", end=" + end + "]";
     }
   }
+  /** sets b=0 to disable passage length normalization */
+  public void testCustomB() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    body.setStringValue("This is a test. This test is a better test but the sentence is excruciatingly long, " +
+        "you have no idea how painful it was for me to type this long sentence into my IDE.");
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000,
+        BreakIterator.getSentenceInstance(Locale.ROOT),
+        new PassageScorer(1.2f, 0, 87),
+        new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "test"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));
+    ir.close();
+    dir.close();
+  }
+  /** sets k1=0 for simple coordinate-level match (# of query terms present) */
+  public void testCustomK1() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    body.setStringValue("This has only foo foo. " +
+        "On the other hand this sentence contains both foo and bar. " +
+        "This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000,
+        BreakIterator.getSentenceInstance(Locale.ROOT),
+        new PassageScorer(0, 0.75f, 87),
+        new PassageFormatter());
+    BooleanQuery query = new BooleanQuery();
+    query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
+    query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("On the other hand"));
+    ir.close();
+    dir.close();
+  }
 }

@@ -55,6 +55,9 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
  *       preTag="&amp;lt;em&amp;gt;"
  *       postTag="&amp;lt;/em&amp;gt;"
  *       ellipsis="... "
+ *       k1="1.2"
+ *       b="0.75"
+ *       pivot="87"
  *       maxLength=10000/&gt;
  *   &lt;/searchComponent&gt;
  * </pre>
@@ -78,7 +81,23 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
   public void init(PluginInfo info) {
     Map<String,String> attributes = info.attributes;
     BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
-    PassageScorer scorer = new PassageScorer();
+    // scorer parameters: k1/b/pivot
+    String k1 = attributes.get("k1");
+    if (k1 == null) {
+      k1 = "1.2";
+    }
+    String b = attributes.get("b");
+    if (b == null) {
+      b = "0.75";
+    }
+    String pivot = attributes.get("pivot");
+    if (pivot == null) {
+      pivot = "87";
+    }
+    PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
+    // formatter parameters: preTag/postTag/ellipsis
     String preTag = attributes.get("preTag");
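
The three attribute lookups above all repeat the same get-or-default pattern before the values reach the new PassageScorer constructor. Purely as a hypothetical tidy-up, not part of this commit, the same parsing can be expressed with one small helper:

    // Hypothetical helper, not in the commit: one default-aware parse instead of three null checks.
    private static float parseFloatAttribute(Map<String,String> attributes, String name, float defaultValue) {
      String value = attributes.get(name);
      return value == null ? defaultValue : Float.parseFloat(value);
    }

    // init() would then build the scorer as:
    //   PassageScorer scorer = new PassageScorer(
    //       parseFloatAttribute(attributes, "k1", 1.2f),
    //       parseFloatAttribute(attributes, "b", 0.75f),
    //       parseFloatAttribute(attributes, "pivot", 87f));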