diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 30e15871edb..b4d6f8e50f2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -904,6 +904,9 @@ New features cause a ParseException (depending on whether strict parsing is enabled). (Luca Cavanna via Chris Male) +* LUCENE-3440: Add ordered fragments feature with IDF-weighted terms for FVH. + (Sebastian Lutze via Koji Sekiguchi) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java index 9dd43110e75..60288a5a4ba 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java @@ -150,7 +150,7 @@ public class FieldPhraseList { } /** - * @return the termInfos + * @return the termInfos */ public List getTermsInfos() { return termsInfos; @@ -164,7 +164,7 @@ public class FieldPhraseList { this.boost = boost; this.seqnum = seqnum; - // now we keep TermInfos for further operations + // We keep TermInfos for further operations termsInfos = new ArrayList( terms ); termsOffsets = new ArrayList( terms.size() ); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java index 5f0aeaf21d0..d9f0b473469 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFieldFragList.java @@ -42,12 +42,13 @@ public class SimpleFieldFragList extends FieldFragList { */ @Override public void add( int startOffset, int endOffset, List phraseInfoList ) { - float score = 0; + float totalBoost = 0; List 
subInfos = new ArrayList(); for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) ); - score += phraseInfo.getBoost(); + totalBoost += phraseInfo.getBoost(); } - getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, score ) ); + getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) ); } + } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java new file mode 100644 index 00000000000..54122ff3267 --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFieldFragList.java @@ -0,0 +1,76 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; + +/** + * A weighted implementation of {@link FieldFragList}. + */ +public class WeightedFieldFragList extends FieldFragList { + + /** + * a constructor. + * + * @param fragCharSize the length (number of chars) of a fragment + */ + public WeightedFieldFragList( int fragCharSize ) { + super( fragCharSize ); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FieldFragList#add( int startOffset, int endOffset, List phraseInfoList ) + */ + @Override + public void add( int startOffset, int endOffset, List phraseInfoList ) { + + float totalBoost = 0; + + List subInfos = new ArrayList(); + + HashSet distinctTerms = new HashSet(); + + int length = 0; + + for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ + + subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) ); + + for ( TermInfo ti : phraseInfo.getTermsInfos()) { + if ( distinctTerms.add( ti.getText() ) ) + totalBoost += ti.getWeight() * phraseInfo.getBoost(); + length++; + } + } + + // We want that terms per fragment (length) is included into the weight. Otherwise a one-word-query + // would cause an equal weight for all fragments regardless of how much words they contain. + // To avoid that fragments containing a high number of words possibly "outrank" more relevant fragments + // we "bend" the length with a standard-normalization a little bit. 
+ totalBoost *= length * ( 1 / Math.sqrt( length ) ); + + getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) ); + } + +} \ No newline at end of file diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java new file mode 100644 index 00000000000..655a6c7114c --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilder.java @@ -0,0 +1,41 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A weighted implementation of {@link FragListBuilder}. 
+ */ +public class WeightedFragListBuilder extends BaseFragListBuilder { + + public WeightedFragListBuilder() { + super(); + } + + public WeightedFragListBuilder(int margin) { + super(margin); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.vectorhighlight.FragListBuilder#createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) + */ + @Override + public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){ + return createFieldFragList( fieldPhraseList, new WeightedFieldFragList( fragCharSize ), fragCharSize ); + } + +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html index d713fa2428f..f8f17414f8c 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html @@ -27,9 +27,9 @@ This is an another highlighter implementation.
  • support multi-term (includes wildcard, range, regexp, etc) queries
  • need Java 1.5
  • highlight fields need to be stored with Positions and Offsets
  • -
  • take into account query boost to score fragments
  • +
  • take into account query boost and/or IDF-weight to score fragments
  • support colored highlight tags
  • -
  • pluggable FragListBuilder
  • +
  • pluggable FragListBuilder / FieldFragList
  • pluggable FragmentsBuilder
  • @@ -122,9 +122,8 @@ by reference to QueryPhraseMap and FieldTermStack.

    +----------------+-----------------+---+

    The type of each entry is WeightedPhraseInfo that consists of -an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to -calculate the weight) will be taken into account when Fast Vector Highlighter creates -{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.

    +an array of terms offsets and weight. +

    Step 4.

    In Step 4, Fast Vector Highlighter creates FieldFragList by reference to FieldPhraseList. In this sample case, the following @@ -137,6 +136,59 @@ calculate the weight) will be taken into account when Fast Vector Highlighter cr |totalBoost=3 | +---------------------------------+ + +

    +The calculation for each FieldFragList.WeightedFragInfo.totalBoost (weight) +depends on the implementation of FieldFragList.add( ... ): +

    +  public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
    +    float totalBoost = 0;
    +    List<SubInfo> subInfos = new ArrayList<SubInfo>();
    +    for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    +      subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
    +      totalBoost += phraseInfo.getBoost();
    +    }
    +    getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
    +  }
    +  
    +
    +The FieldFragList implementation that is used is specified in BaseFragListBuilder.createFieldFragList( ... ): +
    +  public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){
    +    return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize );
    +  }
    +
    +

    +Currently there are basically two approaches available: +

    +
      +
    • SimpleFragListBuilder using SimpleFieldFragList: sum-of-boosts-approach. The totalBoost is calculated by summing the query-boosts per term. By default a term is boosted by 1.0
    • +
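    • WeightedFragListBuilder using WeightedFieldFragList: sum-of-distinct-weights-approach. The totalBoost is calculated by summing the IDF-weights of distinct terms (each multiplied by the query-boost of its phrase) and scaling the sum by the square root of the number of term occurrences in the fragment.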
    • +
    +

    Comparison of the two approaches:

    + + + + + + + + + + + + + + + + + + + +
    + query = das alte testament (The Old Testament) +
    Terms in fragmentsum-of-distinct-weightssum-of-boosts
    das alte testament5.3396213.0
    das alte testament5.3396213.0
    das testament alte5.3396213.0
    das alte testament5.3396213.0
    das testament2.94556882.0
    das alte2.47595952.0
    das das das das1.50153574.0
    das das das1.30036813.0
    das das1.0617462.0
    alte1.01.0
    alte1.01.0
    das0.75076781.0
    das0.75076781.0
    das0.75076781.0
    das0.75076781.0
    das0.75076781.0
    +

    Step 5.

    In Step 5, by using FieldFragList and the field stored data, Fast Vector Highlighter creates highlighted snippets!

    diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java new file mode 100644 index 00000000000..9d7d566108e --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/WeightedFragListBuilderTest.java @@ -0,0 +1,35 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class WeightedFragListBuilderTest extends AbstractTestCase { + + public void test2WeightedFragList() throws Exception { + + makeIndexLongMV(); + + FieldQuery fq = new FieldQuery( pqF( "the", "both" ), true, true ); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); + FieldPhraseList fpl = new FieldPhraseList( stack, fq ); + WeightedFragListBuilder wflb = new WeightedFragListBuilder(); + FieldFragList ffl = wflb.createFieldFragList( fpl, 100 ); + assertEquals( 1, ffl.getFragInfos().size() ); + assertEquals( "subInfos=(theboth((195,203)))/0.86791086(189,289)", ffl.getFragInfos().get( 0 ).toString() ); + } + +}