diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 30e15871edb..b4d6f8e50f2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -904,6 +904,9 @@ New features
cause a ParseException (depending on whether strict parsing is enabled).
(Luca Cavanna via Chris Male)
+* LUCENE-3440: Add ordered fragments feature with IDF-weighted terms for FVH.
+ (Sebastian Lutze via Koji Sekiguchi)
+
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
index 9dd43110e75..60288a5a4ba 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
@@ -150,7 +150,7 @@ public class FieldPhraseList {
}
/**
- * @return the termInfos
+ * @return the termInfos
*/
public List getTermsInfos() {
return termsInfos;
@@ -164,7 +164,7 @@ public class FieldPhraseList {
this.boost = boost;
this.seqnum = seqnum;
- // now we keep TermInfos for further operations
+ // We keep TermInfos for further operations
termsInfos = new ArrayList( terms );
termsOffsets = new ArrayList( terms.size() );
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
index 5f0aeaf21d0..d9f0b473469 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java
@@ -42,12 +42,13 @@ public class SimpleFieldFragList extends FieldFragList {
*/
@Override
public void add( int startOffset, int endOffset, List phraseInfoList ) {
- float score = 0;
+ float totalBoost = 0;
List subInfos = new ArrayList();
for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
- score += phraseInfo.getBoost();
+ totalBoost += phraseInfo.getBoost();
}
- getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, score ) );
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
+
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
new file mode 100644
index 00000000000..54122ff3267
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java
@@ -0,0 +1,76 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * A {@link FieldFragList} that scores a fragment by summing the weights of its distinct terms (each times its phrase boost), scaled by a square-root-damped term count.
+ */
+public class WeightedFieldFragList extends FieldFragList {
+
+ /**
+ * Creates a weighted fragment list; the argument is handed to the {@link FieldFragList} constructor.
+ *
+ * @param fragCharSize the length (number of chars) of a fragment
+ */
+ public WeightedFieldFragList( int fragCharSize ) {
+ super( fragCharSize );
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List phraseInfoList )
+ */
+ @Override
+ public void add( int startOffset, int endOffset, List phraseInfoList ) {
+
+ float totalBoost = 0;
+
+ List subInfos = new ArrayList();
+
+ HashSet distinctTerms = new HashSet(); // term texts that already contributed to totalBoost
+
+ int length = 0; // number of term occurrences seen, duplicates included
+
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+
+ subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+
+ for ( TermInfo ti : phraseInfo.getTermsInfos()) {
+ if ( distinctTerms.add( ti.getText() ) )
+ totalBoost += ti.getWeight() * phraseInfo.getBoost();
+ length++; // not governed by the brace-less if above: every term occurrence is counted
+ }
+ }
+
+ // The number of terms per fragment (length) is folded into the weight; otherwise a one-word query
+ // would give every fragment the same weight regardless of how many words it contains.
+ // To keep fragments with many words from "outranking" more relevant ones, the length is damped:
+ // length * (1 / sqrt(length)) equals sqrt(length). NOTE(review): length == 0 yields 0 * Infinity = NaN - confirm an empty phrase list cannot reach here.
+ totalBoost *= length * ( 1 / Math.sqrt( length ) );
+
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+ }
+
+}
\ No newline at end of file
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
new file mode 100644
index 00000000000..655a6c7114c
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A weighted implementation of {@link FragListBuilder}: the fragment lists it creates are {@link WeightedFieldFragList} instances.
+ */
+public class WeightedFragListBuilder extends BaseFragListBuilder {
+
+ public WeightedFragListBuilder() {
+ super(); // no margin given; whatever default BaseFragListBuilder applies is used
+ }
+
+ public WeightedFragListBuilder(int margin) {
+ super(margin); // margin is passed through to BaseFragListBuilder unchanged
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize)
+ */
+ @Override
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+ return createFieldFragList( fieldPhraseList, new WeightedFieldFragList( fragCharSize ), fragCharSize ); // delegates to the three-argument overload (not defined in this class), supplying a new WeightedFieldFragList
+ }
+
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
index d713fa2428f..f8f17414f8c 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
@@ -27,9 +27,9 @@ This is an another highlighter implementation.
support multi-term (includes wildcard, range, regexp, etc) queries
need Java 1.5
highlight fields need to be stored with Positions and Offsets
-take into account query boost to score fragments
+take into account query boost and/or IDF-weight to score fragments
support colored highlight tags
-pluggable FragListBuilder
+pluggable FragListBuilder / FieldFragList
pluggable FragmentsBuilder
@@ -122,9 +122,8 @@ by reference to QueryPhraseMap
and FieldTermStack
.
+----------------+-----------------+---+
The type of each entry is WeightedPhraseInfo
that consists of
-an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
-calculate the weight) will be taken into account when Fast Vector Highlighter creates
-{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.
+an array of terms offsets and weight.
+
Step 4.
In Step 4, Fast Vector Highlighter creates FieldFragList
by reference to
FieldPhraseList
. In this sample case, the following
@@ -137,6 +136,59 @@ calculate the weight) will be taken into account when Fast Vector Highlighter cr
|totalBoost=3 |
+---------------------------------+
+
+
+The calculation for each FieldFragList.WeightedFragInfo.totalBoost
(weight)
+depends on the implementation of FieldFragList.add( ... )
:
+
+ public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
+ float totalBoost = 0;
+ List<SubInfo> subInfos = new ArrayList<SubInfo>();
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+ subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
+ totalBoost += phraseInfo.getBoost();
+ }
+ getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
+ }
+
+
+The used implementation of FieldFragList
is noted in BaseFragListBuilder.createFieldFragList( ... )
:
+
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
+ return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
+ }
+
+
+Currently there are basically two approaches available:
+
+
+SimpleFragListBuilder using SimpleFieldFragList
: sum-of-boosts-approach. The totalBoost is calculated by summing the query-boosts per term. By default a term is boosted by 1.0
+WeightedFragListBuilder using WeightedFieldFragList
: sum-of-distinct-weights-approach. The totalBoost is calculated by summing the IDF-weights of distinct terms.
+
+Comparison of the two approaches:
+
+
+ query = das alte testament (The Old Testament)
+
+Terms in fragment | sum-of-distinct-weights | sum-of-boosts |
+das alte testament | 5.339621 | 3.0 |
+das alte testament | 5.339621 | 3.0 |
+das testament alte | 5.339621 | 3.0 |
+das alte testament | 5.339621 | 3.0 |
+das testament | 2.9455688 | 2.0 |
+das alte | 2.4759595 | 2.0 |
+das das das das | 1.5015357 | 4.0 |
+das das das | 1.3003681 | 3.0 |
+das das | 1.061746 | 2.0 |
+alte | 1.0 | 1.0 |
+alte | 1.0 | 1.0 |
+das | 0.7507678 | 1.0 |
+das | 0.7507678 | 1.0 |
+das | 0.7507678 | 1.0 |
+das | 0.7507678 | 1.0 |
+das | 0.7507678 | 1.0 |
+
+
Step 5.
In Step 5, by using FieldFragList
and the field stored data,
Fast Vector Highlighter creates highlighted snippets!
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
new file mode 100644
index 00000000000..9d7d566108e
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class WeightedFragListBuilderTest extends AbstractTestCase {
+
+ public void test2WeightedFragList() throws Exception { // builds a fragment list with WeightedFragListBuilder and pins its single fragment
+
+ makeIndexLongMV(); // helper not defined in this class; presumably indexes the long multi-valued fixture - TODO confirm
+
+ FieldQuery fq = new FieldQuery( pqF( "the", "both" ), true, true );
+ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
+ FieldPhraseList fpl = new FieldPhraseList( stack, fq );
+ WeightedFragListBuilder wflb = new WeightedFragListBuilder();
+ FieldFragList ffl = wflb.createFieldFragList( fpl, 100 ); // fragCharSize = 100
+ assertEquals( 1, ffl.getFragInfos().size() ); // exactly one fragment is expected
+ assertEquals( "subInfos=(theboth((195,203)))/0.86791086(189,289)", ffl.getFragInfos().get( 0 ).toString() ); // pins toString(); presumably sub-infos / weight (start,end) - TODO confirm format
+ }
+
+}