mirror of https://github.com/apache/lucene.git
LUCENE-4133: FVH: A weighted approach for ordered fragments, part of LUCENE-3440
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1349361 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4b212a7c4f
commit
2210749fbd
|
@ -904,6 +904,9 @@ New features
|
|||
cause a ParseException (depending on whether strict parsing is enabled).
|
||||
(Luca Cavanna via Chris Male)
|
||||
|
||||
* LUCENE-3440: Add ordered fragments feature with IDF-weighted terms for FVH.
|
||||
(Sebastian Lutze via Koji Sekiguchi)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||
|
|
|
@ -150,7 +150,7 @@ public class FieldPhraseList {
|
|||
}
|
||||
|
||||
/**
|
||||
* @return the termInfos
|
||||
* @return the termInfos
|
||||
*/
|
||||
public List<TermInfo> getTermsInfos() {
|
||||
return termsInfos;
|
||||
|
@ -164,7 +164,7 @@ public class FieldPhraseList {
|
|||
this.boost = boost;
|
||||
this.seqnum = seqnum;
|
||||
|
||||
// now we keep TermInfos for further operations
|
||||
// We keep TermInfos for further operations
|
||||
termsInfos = new ArrayList<TermInfo>( terms );
|
||||
|
||||
termsOffsets = new ArrayList<Toffs>( terms.size() );
|
||||
|
|
|
@ -42,12 +42,13 @@ public class SimpleFieldFragList extends FieldFragList {
|
|||
*/
|
||||
@Override
|
||||
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
|
||||
float score = 0;
|
||||
float totalBoost = 0;
|
||||
List<SubInfo> subInfos = new ArrayList<SubInfo>();
|
||||
for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
|
||||
subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
|
||||
score += phraseInfo.getBoost();
|
||||
totalBoost += phraseInfo.getBoost();
|
||||
}
|
||||
getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, score ) );
|
||||
getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
|
||||
|
||||
/**
|
||||
* A weighted implementation of {@link FieldFragList}.
|
||||
*/
|
||||
public class WeightedFieldFragList extends FieldFragList {
|
||||
|
||||
/**
|
||||
* a constructor.
|
||||
*
|
||||
* @param fragCharSize the length (number of chars) of a fragment
|
||||
*/
|
||||
public WeightedFieldFragList( int fragCharSize ) {
|
||||
super( fragCharSize );
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList )
|
||||
*/
|
||||
@Override
|
||||
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
|
||||
|
||||
float totalBoost = 0;
|
||||
|
||||
List<SubInfo> subInfos = new ArrayList<SubInfo>();
|
||||
|
||||
HashSet<String> distinctTerms = new HashSet<String>();
|
||||
|
||||
int length = 0;
|
||||
|
||||
for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
|
||||
|
||||
subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
|
||||
|
||||
for ( TermInfo ti : phraseInfo.getTermsInfos()) {
|
||||
if ( distinctTerms.add( ti.getText() ) )
|
||||
totalBoost += ti.getWeight() * phraseInfo.getBoost();
|
||||
length++;
|
||||
}
|
||||
}
|
||||
|
||||
// We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query
|
||||
// would cause an equal weight for all fragments regardless of how much words they contain.
|
||||
// To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments
|
||||
// we "bend" the length with a standard-normalization a little bit.
|
||||
totalBoost *= length * ( 1 / Math.sqrt( length ) );
|
||||
|
||||
getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A weighted implementation of {@link FragListBuilder}.
|
||||
*/
|
||||
public class WeightedFragListBuilder extends BaseFragListBuilder {
|
||||
|
||||
public WeightedFragListBuilder() {
|
||||
super();
|
||||
}
|
||||
|
||||
public WeightedFragListBuilder(int margin) {
|
||||
super(margin);
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize)
|
||||
*/
|
||||
@Override
|
||||
public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
|
||||
return createFieldFragList( fieldPhraseList, new WeightedFieldFragList( fragCharSize ), fragCharSize );
|
||||
}
|
||||
|
||||
}
|
|
@ -27,9 +27,9 @@ This is an another highlighter implementation.
|
|||
<li>support multi-term (includes wildcard, range, regexp, etc) queries</li>
|
||||
<li>need Java 1.5</li>
|
||||
<li>highlight fields need to be stored with Positions and Offsets</li>
|
||||
<li>take into account query boost to score fragments</li>
|
||||
<li>take into account query boost and/or IDF-weight to score fragments</li>
|
||||
<li>support colored highlight tags</li>
|
||||
<li>pluggable FragListBuilder</li>
|
||||
<li>pluggable FragListBuilder / FieldFragList</li>
|
||||
<li>pluggable FragmentsBuilder</li>
|
||||
</ul>
|
||||
|
||||
|
@ -122,9 +122,8 @@ by reference to <code>QueryPhraseMap</code> and <code>FieldTermStack</code>.</p>
|
|||
+----------------+-----------------+---+
|
||||
</pre>
|
||||
<p>The type of each entry is <code>WeightedPhraseInfo</code> that consists of
|
||||
an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
|
||||
calculate the weight) will be taken into account when Fast Vector Highlighter creates
|
||||
{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.</p>
|
||||
an array of terms offsets and weight.
|
||||
</p>
|
||||
<h3>Step 4.</h3>
|
||||
<p>In Step 4, Fast Vector Highlighter creates <code>FieldFragList</code> by reference to
|
||||
<code>FieldPhraseList</code>. In this sample case, the following
|
||||
|
@ -137,6 +136,59 @@ calculate the weight) will be taken into account when Fast Vector Highlighter cr
|
|||
|totalBoost=3 |
|
||||
+---------------------------------+
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
The calculation for each <code>FieldFragList.WeightedFragInfo.totalBoost</code> (weight)
|
||||
depends on the implementation of <code>FieldFragList.add( ... )</code>:
|
||||
<pre class="prettyprint">
|
||||
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
|
||||
float totalBoost = 0;
|
||||
List<SubInfo> subInfos = new ArrayList<SubInfo>();
|
||||
for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
|
||||
subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
|
||||
totalBoost += phraseInfo.getBoost();
|
||||
}
|
||||
getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
|
||||
}
|
||||
|
||||
</pre>
|
||||
The <code>FieldFragList</code> implementation that is used is specified in <code>BaseFragListBuilder.createFieldFragList( ... )</code>:
|
||||
<pre class="prettyprint">
|
||||
public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
|
||||
return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
|
||||
}
|
||||
</pre>
|
||||
<p>
|
||||
Currently there are basically two approaches available:
|
||||
</p>
|
||||
<ul>
|
||||
<li><code>SimpleFragListBuilder using SimpleFieldFragList</code>: <i>sum-of-boosts</i>-approach. The totalBoost is calculated by summing the query-boosts per term. By default a term is boosted by 1.0.</li>
|
||||
<li><code>WeightedFragListBuilder using WeightedFieldFragList</code>: <i>sum-of-distinct-weights</i>-approach. The totalBoost is calculated by summing the IDF-weights of distinct terms.</li>
|
||||
</ul>
|
||||
<p>Comparison of the two approaches:</p>
|
||||
<table border="1">
|
||||
<caption>
|
||||
query = das alte testament (The Old Testament)
|
||||
</caption>
|
||||
<tr><th>Terms in fragment</th><th>sum-of-distinct-weights</th><th>sum-of-boosts</th></tr>
|
||||
<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
|
||||
<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
|
||||
<tr><td>das testament alte</td><td>5.339621</td><td>3.0</td></tr>
|
||||
<tr><td>das alte testament</td><td>5.339621</td><td>3.0</td></tr>
|
||||
<tr><td>das testament</td><td>2.9455688</td><td>2.0</td></tr>
|
||||
<tr><td>das alte</td><td>2.4759595</td><td>2.0</td></tr>
|
||||
<tr><td>das das das das</td><td>1.5015357</td><td>4.0</td></tr>
|
||||
<tr><td>das das das</td><td>1.3003681</td><td>3.0</td></tr>
|
||||
<tr><td>das das</td><td>1.061746</td><td>2.0</td></tr>
|
||||
<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
|
||||
<tr><td>alte</td><td>1.0</td><td>1.0</td></tr>
|
||||
<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
|
||||
<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
|
||||
<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
|
||||
<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
|
||||
<tr><td>das</td><td>0.7507678</td><td>1.0</td></tr>
|
||||
</table>
|
||||
|
||||
<h3>Step 5.</h3>
|
||||
<p>In Step 5, by using <code>FieldFragList</code> and the field stored data,
|
||||
Fast Vector Highlighter creates highlighted snippets!</p>
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
package org.apache.lucene.search.vectorhighlight;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
public class WeightedFragListBuilderTest extends AbstractTestCase {
|
||||
|
||||
public void test2WeightedFragList() throws Exception {
|
||||
|
||||
makeIndexLongMV();
|
||||
|
||||
FieldQuery fq = new FieldQuery( pqF( "the", "both" ), true, true );
|
||||
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
|
||||
FieldPhraseList fpl = new FieldPhraseList( stack, fq );
|
||||
WeightedFragListBuilder wflb = new WeightedFragListBuilder();
|
||||
FieldFragList ffl = wflb.createFieldFragList( fpl, 100 );
|
||||
assertEquals( 1, ffl.getFragInfos().size() );
|
||||
assertEquals( "subInfos=(theboth((195,203)))/0.86791086(189,289)", ffl.getFragInfos().get( 0 ).toString() );
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue