LUCENE-4133: FVH: A weighted approach for ordered fragments, part of LUCENE-3440

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1349361 13f79535-47bb-0310-9956-ffa450edef68
2025-03-03 06:49:38 +00:00 · 2012-06-12 13:59:37 +00:00 · 2012-06-12 13:59:37 +00:00 · 2210749fbd
commit 2210749fbd
parent 4b212a7c4f
7 changed files with 218 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -904,6 +904,9 @@ New features
  cause a ParseException (depending on whether strict parsing is enabled).
  (Luca Cavanna via Chris Male) 

+* LUCENE-3440: Add ordered fragments feature with IDF-weighted terms for FVH.
+  (Sebastian Lutze via Koji Sekiguchi)
+
 Optimizations

 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
@ -150,7 +150,7 @@ public class FieldPhraseList {
    }

    /**
-     * @return the termInfos
+     * @return the termInfos 
     */    
    public List<TermInfo> getTermsInfos() {
      return termsInfos;
@ -164,7 +164,7 @@ public class FieldPhraseList {
      this.boost = boost;
      this.seqnum = seqnum;
      
-      // now we keep TermInfos for further operations
+      // We keep TermInfos for further operations
      termsInfos = new ArrayList<TermInfo>( terms );
      
      termsOffsets = new ArrayList<Toffs>( terms.size() );
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
@ -42,12 +42,13 @@ public class SimpleFieldFragList extends FieldFragList {
   */
  @Override
  public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
-    float score = 0;
+    float totalBoost = 0;
    List<SubInfo> subInfos = new ArrayList<SubInfo>();
    for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
      subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
-      score += phraseInfo.getBoost();
+      totalBoost += phraseInfo.getBoost();
    }
-    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, score ) );
+    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
  }
+  
 }
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
@ -0,0 +1,76 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * A weighted implementation of {@link FieldFragList}.
+ */
+public class WeightedFieldFragList extends FieldFragList {
+
+  /**
+   * Creates a fragment list; the argument is handed unchanged to the
+   * {@link FieldFragList} constructor.
+   * 
+   * @param fragCharSize the length (number of chars) of a fragment
+   */
+  public WeightedFieldFragList( int fragCharSize ) {
+    super( fragCharSize );
+  }
+
+  /**
+   * Adds one fragment whose total boost is the sum of the weights of its distinct terms,
+   * scaled by the number of term occurrences in the fragment.
+   * <p>
+   * For every phrase a {@link SubInfo} is recorded. A term's weight (described as an
+   * IDF weight in the package documentation) is added only the first time its text is
+   * seen in this fragment, multiplied by the boost of the phrase in which it is first seen.
+   * Every term occurrence, repeated or not, increases the fragment length.
+   *
+   * @param startOffset start offset of the fragment
+   * @param endOffset end offset of the fragment
+   * @param phraseInfoList the phrases that make up the fragment
+   * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList )
+   */
+  @Override
+  public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
+
+    float totalBoost = 0;
+    List<SubInfo> subInfos = new ArrayList<SubInfo>();
+    HashSet<String> distinctTerms = new HashSet<String>();
+    // number of term occurrences in the fragment, duplicates included
+    int length = 0;
+
+    for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+
+      subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+
+      for ( TermInfo ti : phraseInfo.getTermsInfos() ) {
+        // Set.add() returns true only for a term text not seen before in this fragment,
+        // so each distinct term contributes its weight exactly once.
+        if ( distinctTerms.add( ti.getText() ) ) {
+          totalBoost += ti.getWeight() * phraseInfo.getBoost();
+        }
+        length++;
+      }
+    }
+
+    // The number of terms per fragment (length) is included in the weight. Otherwise a
+    // one-word query would give every fragment the same weight, no matter how many
+    // matching words it contains. To keep fragments with many words from outranking more
+    // relevant ones, the length is dampened: length * (1 / sqrt(length)), i.e. sqrt(length).
+    // The expression is kept in its original form so existing scores keep their rounding.
+    //
+    // Guard: with no terms at all, 1 / sqrt(0) is Infinity and 0 * Infinity is NaN, which
+    // would store a NaN boost. An empty fragment keeps a boost of 0 instead.
+    if ( length > 0 ) {
+      totalBoost *= length * ( 1 / Math.sqrt( length ) );
+    }
+
+    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+  }
+
+}
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
@ -0,0 +1,41 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A weighted implementation of {@link FragListBuilder}.
+ */
+public class WeightedFragListBuilder extends BaseFragListBuilder {
+
+  /**
+   * Creates a builder through the no-argument constructor of {@link BaseFragListBuilder}.
+   */
+  public WeightedFragListBuilder() {
+    super();
+  }
+
+  /**
+   * Creates a builder, handing the margin on to {@link BaseFragListBuilder}.
+   *
+   * @param margin value passed unchanged to the superclass constructor
+   */
+  public WeightedFragListBuilder(int margin) {
+    super(margin);
+  }
+
+  /**
+   * Builds the fragment list into a fresh {@link WeightedFieldFragList} by delegating to the
+   * three-argument <code>createFieldFragList</code> overload (not defined in this class).
+   *
+   * @param fieldPhraseList the phrases to build fragments from
+   * @param fragCharSize the length (number of chars) of a fragment
+   * @return whatever the three-argument overload returns
+   * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize)
+   */
+  @Override
+  public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+    WeightedFieldFragList weightedFragList = new WeightedFieldFragList( fragCharSize );
+    return createFieldFragList( fieldPhraseList, weightedFragList, fragCharSize );
+  }
+
+}
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
@ -27,9 +27,9 @@ This is an another highlighter implementation.
 <li>support multi-term (includes wildcard, range, regexp, etc) queries</li>
 <li>need Java 1.5</li>
 <li>highlight fields need to be stored with Positions and Offsets</li>
-<li>take into account query boost to score fragments</li>
+<li>take into account query boost and/or IDF-weight to score fragments</li>
 <li>support colored highlight tags</li>
-<li>pluggable FragListBuilder</li>
+<li>pluggable FragListBuilder / FieldFragList</li>
 <li>pluggable FragmentsBuilder</li>
 </ul>

@ -122,9 +122,8 @@ by reference to <code>QueryPhraseMap</code> and <code>FieldTermStack</code>.</p>
 +----------------+-----------------+---+
 </pre>
 <p>The type of each entry is <code>WeightedPhraseInfo</code> that consists of
-an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
-calculate the weight) will be taken into account when Fast Vector Highlighter creates
-{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.</p>
+an array of terms offsets and weight. 
+</p>
 <h3>Step 4.</h3>
 <p>In Step 4, Fast Vector Highlighter creates <code>FieldFragList</code> by reference to
 <code>FieldPhraseList</code>. In this sample case, the following
@ -137,6 +136,59 @@ calculate the weight) will be taken into account when Fast Vector Highlighter cr
 |totalBoost=3                     |
 +---------------------------------+
 </pre>
+
+<p>
+The calculation for each <code>FieldFragList.WeightedFragInfo.totalBoost</code> (weight)
+depends on the implementation of <code>FieldFragList.add( ... )</code>:</p>
+<pre class="prettyprint">
+  public void add( int startOffset, int endOffset, List&lt;WeightedPhraseInfo&gt; phraseInfoList ) {
+    float totalBoost = 0;
+    List&lt;SubInfo&gt; subInfos = new ArrayList&lt;SubInfo&gt;();
+    for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+      subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+      totalBoost += phraseInfo.getBoost();
+    }
+    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+  }
+  
+</pre>
+<p>Which implementation of <code>FieldFragList</code> is used is determined by <code>createFieldFragList( ... )</code> of the <code>FragListBuilder</code> (a subclass of <code>BaseFragListBuilder</code>), for example:</p>
+<pre class="prettyprint">
+  public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+    return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
+  }
+</pre>
+<p>
+Currently there are basically two approaches available:
+</p>
+<ul>
+<li><code>SimpleFragListBuilder</code> using <code>SimpleFieldFragList</code>: <i>sum-of-boosts</i> approach. The totalBoost is calculated by summing up the query boosts per term. By default a term is boosted by 1.0.</li>
+<li><code>WeightedFragListBuilder</code> using <code>WeightedFieldFragList</code>: <i>sum-of-distinct-weights</i> approach. The totalBoost is calculated by summing up the IDF weights of the distinct terms.</li>
+</ul> 
+<p>Comparison of the two approaches:</p>
+<table border="1">
+<caption>
+	query = das alte testament (The Old Testament)
+</caption>
+<tr><th>Terms in fragment</th><th>sum-of-distinct-weights</th><th>sum-of-boosts</th></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das testament alte</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
+<tr><td>das testament</td><td>2.9455688</td><td>2.0</td></tr>
+<tr><td>das alte</td><td>2.4759595</td><td>2.0</td></tr>
+<tr><td>das das das das</td><td>1.5015357</td><td>4.0</td></tr>
+<tr><td>das das das</td><td>1.3003681</td><td>3.0</td></tr>
+<tr><td>das das</td><td>1.061746</td><td>2.0</td></tr>
+<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
+<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
+</table>
+
 <h3>Step 5.</h3>
 <p>In Step 5, by using <code>FieldFragList</code> and the field stored data,
 Fast Vector Highlighter creates highlighted snippets!</p>
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
@ -0,0 +1,35 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class WeightedFragListBuilderTest extends AbstractTestCase {
+
+  /**
+   * Runs {@link WeightedFragListBuilder} with a fragment size of 100 over the index created by
+   * <code>makeIndexLongMV()</code>, using the query returned by <code>pqF( "the", "both" )</code>,
+   * and expects exactly one fragment with the pinned string representation.
+   */
+  public void test2WeightedFragList() throws Exception {
+
+    makeIndexLongMV();
+
+    FieldQuery fieldQuery = new FieldQuery( pqF( "the", "both" ), true, true );
+    FieldPhraseList phraseList = new FieldPhraseList( new FieldTermStack( reader, 0, F, fieldQuery ), fieldQuery );
+    FieldFragList fragList = new WeightedFragListBuilder().createFieldFragList( phraseList, 100 );
+
+    assertEquals( 1, fragList.getFragInfos().size() );
+    // the expected string includes the weighted score, so it also pins the boost calculation
+    assertEquals( "subInfos=(theboth((195,203)))/0.86791086(189,289)", fragList.getFragInfos().get( 0 ).toString() );
+  }
+
+}