From 53fd6a8919ba84ba27f140b5532af61f82f744db Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 14 Mar 2013 18:25:18 +0000
Subject: [PATCH] LUCENE-4827: expose bm25 parameters in PassageScorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1456597 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |  3 +
 .../postingshighlight/PassageScorer.java      | 32 ++++++--
 .../TestPostingsHighlighterRanking.java       | 77 +++++++++++++++++++
 .../highlight/PostingsSolrHighlighter.java    | 21 ++++-
 4 files changed, 127 insertions(+), 6 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f268e9b8667..71dcc9fa1fb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -73,6 +73,9 @@ New Features
   now highlights the entire content as a single Passage.  (Robert
   Muir, Mike McCandless)
 
+* LUCENE-4816: Add additional ctor to PostingsHighlighter PassageScorer
+  to provide bm25 k1,b,avgdl parameters. (Robert Muir)
+
 Optimizations
 
 * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
index 4d364502795..1e732f9d20e 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
@@ -30,15 +30,37 @@ public class PassageScorer {
   // TODO: this formula is completely made up. It might not provide relevant snippets!
   
   /** BM25 k1 parameter, controls term frequency normalization */
-  public static final float k1 = 1.2f;
+  final float k1;
   /** BM25 b parameter, controls length normalization. */
-  public static final float b = 0.75f;
+  final float b;
+  /** A pivot used for length normalization. */
+  final float pivot;
   
   /**
-   * A pivot used for length normalization.
-   * The default value is the typical average English sentence length.
+   * Creates PassageScorer with these default values:
+   * <ul>
+   *   <li>{@code k1 = 1.2},
+   *   <li>{@code b = 0.75}.
+   *   <li>{@code pivot = 87}
+   * </ul>
    */
-  public static final float pivot = 87f;
+  public PassageScorer() {
+    // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
+    // 87 is typical average english sentence length.
+    this(1.2f, 0.75f, 87f);
+  }
+  
+  /**
+   * Creates PassageScorer with specified scoring parameters
+   * @param k1 Controls non-linear term frequency normalization (saturation).
+   * @param b Controls to what degree passage length normalizes tf values.
+   * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
+   */
+  public PassageScorer(float k1, float b, float pivot) {
+    this.k1 = k1;
+    this.b = b;
+    this.pivot = pivot;
+  }
     
   /**
    * Computes term importance, given its in-document statistics.
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
index f4cbb3e7a93..1caf2bcdd52 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
@@ -31,6 +31,7 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.Term;
@@ -38,6 +39,7 @@ import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
@@ -237,4 +239,79 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
       return "Pair [start=" + start + ", end=" + end + "]";
     }
   }
+  
+  /** sets b=0 to disable passage length normalization */
+  public void testCustomB() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test.  This test is a better test but the sentence is excruiatingly long, " + 
+                        "you have no idea how painful it was for me to type this long sentence into my IDE.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, 
+                                             BreakIterator.getSentenceInstance(Locale.ROOT), 
+                                             new PassageScorer(1.2f, 0, 87), 
+                                             new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "test"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));
+    
+    ir.close();
+    dir.close();
+  }
+  
+  /** sets k1=0 for simple coordinate-level match (# of query terms present) */
+  public void testCustomK1() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This has only foo foo. " + 
+                        "On the other hand this sentence contains both foo and bar. " + 
+                        "This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, 
+                                             BreakIterator.getSentenceInstance(Locale.ROOT), 
+                                             new PassageScorer(0, 0.75f, 87), 
+                                             new PassageFormatter());
+    BooleanQuery query = new BooleanQuery();
+    query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
+    query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("On the other hand"));
+    
+    ir.close();
+    dir.close();
+  }
 }
diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
index 7f0d2ed159a..60b077f82a3 100644
--- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
@@ -55,6 +55,9 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
  *                      preTag="&amp;lt;em&amp;gt;"
  *                      postTag="&amp;lt;/em&amp;gt;"
  *                      ellipsis="... "
+ *                      k1="1.2"
+ *                      b="0.75"
+ *                      pivot="87"
  *                      maxLength=10000/&gt;
  *   &lt;/searchComponent&gt;
  * </pre>
@@ -78,7 +81,23 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
   public void init(PluginInfo info) {
     Map<String,String> attributes = info.attributes;
     BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
-    PassageScorer scorer = new PassageScorer();
+    
+    // scorer parameters: k1/b/pivot
+    String k1 = attributes.get("k1");
+    if (k1 == null) {
+      k1 = "1.2";
+    }
+    
+    String b = attributes.get("b");
+    if (b == null) {
+      b = "0.75";
+    }
+    
+    String pivot = attributes.get("pivot");
+    if (pivot == null) {
+      pivot = "87";
+    }
+    PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
     
     // formatter parameters: preTag/postTag/ellipsis
     String preTag = attributes.get("preTag");