mirror of https://github.com/apache/lucene.git
LUCENE-4827: expose bm25 parameters in PassageScorer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456597 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3943f0f8c4
commit
53fd6a8919
|
@ -73,6 +73,9 @@ New Features
|
|||
now highlights the entire content as a single Passage. (Robert
|
||||
Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-4816: Add additional ctor to PostingsHighlighter PassageScorer
|
||||
to provide bm25 k1,b,avgdl parameters. (Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
||||
|
|
|
@ -30,15 +30,37 @@ public class PassageScorer {
|
|||
// TODO: this formula is completely made up. It might not provide relevant snippets!
|
||||
|
||||
/** BM25 k1 parameter, controls term frequency normalization */
|
||||
public static final float k1 = 1.2f;
|
||||
final float k1;
|
||||
/** BM25 b parameter, controls length normalization. */
|
||||
public static final float b = 0.75f;
|
||||
final float b;
|
||||
/** A pivot used for length normalization. */
|
||||
final float pivot;
|
||||
|
||||
/**
|
||||
* A pivot used for length normalization.
|
||||
* The default value is the typical average English sentence length.
|
||||
* Creates PassageScorer with these default values:
|
||||
* <ul>
|
||||
* <li>{@code k1 = 1.2},
|
||||
* <li>{@code b = 0.75}.
|
||||
* <li>{@code pivot = 87}
|
||||
* </ul>
|
||||
*/
|
||||
public static final float pivot = 87f;
|
||||
public PassageScorer() {
|
||||
// 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
|
||||
// 87 is typical average english sentence length.
|
||||
this(1.2f, 0.75f, 87f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates PassageScorer with specified scoring parameters
|
||||
* @param k1 Controls non-linear term frequency normalization (saturation).
|
||||
* @param b Controls to what degree passage length normalizes tf values.
|
||||
* @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
|
||||
*/
|
||||
public PassageScorer(float k1, float b, float pivot) {
|
||||
this.k1 = k1;
|
||||
this.b = b;
|
||||
this.pivot = pivot;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes term importance, given its in-document statistics.
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.document.FieldType;
|
|||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -38,6 +39,7 @@ import org.apache.lucene.search.BooleanClause;
|
|||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -237,4 +239,79 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
|
|||
return "Pair [start=" + start + ", end=" + end + "]";
|
||||
}
|
||||
}
|
||||
|
||||
/** sets b=0 to disable passage length normalization */
|
||||
public void testCustomB() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " +
|
||||
"you have no idea how painful it was for me to type this long sentence into my IDE.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT),
|
||||
new PassageScorer(1.2f, 0, 87),
|
||||
new PassageFormatter());
|
||||
Query query = new TermQuery(new Term("body", "test"));
|
||||
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
|
||||
assertEquals(1, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
|
||||
assertEquals(1, snippets.length);
|
||||
assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/** sets k1=0 for simple coordinate-level match (# of query terms present) */
|
||||
public void testCustomK1() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
|
||||
body.setStringValue("This has only foo foo. " +
|
||||
"On the other hand this sentence contains both foo and bar. " +
|
||||
"This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
PostingsHighlighter highlighter = new PostingsHighlighter(10000,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT),
|
||||
new PassageScorer(0, 0.75f, 87),
|
||||
new PassageFormatter());
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
|
||||
query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
|
||||
TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
|
||||
assertEquals(1, topDocs.totalHits);
|
||||
String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
|
||||
assertEquals(1, snippets.length);
|
||||
assertTrue(snippets[0].startsWith("On the other hand"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,6 +55,9 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
|
|||
* preTag="&lt;em&gt;"
|
||||
* postTag="&lt;/em&gt;"
|
||||
* ellipsis="... "
|
||||
* k1="1.2"
|
||||
* b="0.75"
|
||||
* pivot="87"
|
||||
* maxLength=10000/>
|
||||
* </searchComponent>
|
||||
* </pre>
|
||||
|
@ -78,7 +81,23 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
|
|||
public void init(PluginInfo info) {
|
||||
Map<String,String> attributes = info.attributes;
|
||||
BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
|
||||
PassageScorer scorer = new PassageScorer();
|
||||
|
||||
// scorer parameters: k1/b/pivot
|
||||
String k1 = attributes.get("k1");
|
||||
if (k1 == null) {
|
||||
k1 = "1.2";
|
||||
}
|
||||
|
||||
String b = attributes.get("b");
|
||||
if (b == null) {
|
||||
b = "0.75";
|
||||
}
|
||||
|
||||
String pivot = attributes.get("pivot");
|
||||
if (pivot == null) {
|
||||
pivot = "87";
|
||||
}
|
||||
PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
|
||||
|
||||
// formatter parameters: preTag/postTag/ellipsis
|
||||
String preTag = attributes.get("preTag");
|
||||
|
|
Loading…
Reference in New Issue