LUCENE-1889: add MultiTermQuery support for FVH

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166954 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-09-09 01:36:53 +00:00
parent 896afc4c01
commit 9e15eeaa86
8 changed files with 120 additions and 21 deletions

View File

@ -83,6 +83,8 @@ New Features
SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi) can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
* LUCENE-1889: Add MultiTermQuery support for FVH. (Mike Sokolov via Koji Sekiguchi)
Bug Fixes Bug Fixes
* LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the * LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the

View File

@ -76,8 +76,23 @@ public class FastVectorHighlighter {
* @param query a query * @param query a query
* @return the created {@link FieldQuery} object * @return the created {@link FieldQuery} object
*/ */
public FieldQuery getFieldQuery( Query query ){ public FieldQuery getFieldQuery( Query query ) {
return new FieldQuery( query, phraseHighlight, fieldMatch ); try {
return new FieldQuery( query, null, phraseHighlight, fieldMatch );
} catch (IOException e) {
// should never be thrown when reader is null
throw new RuntimeException (e);
}
}
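/**
* create a {@link FieldQuery} object, using an index reader so that multi-term queries can be supported.
*
* @param query a query
* @param reader the {@link IndexReader} handed to the {@link FieldQuery} constructor; it is required to support MultiTermQuery
* @return the created {@link FieldQuery} object
* @throws IOException if building the {@link FieldQuery} fails with an I/O error
*/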
public FieldQuery getFieldQuery( Query query, IndexReader reader ) throws IOException {
return new FieldQuery( query, reader, phraseHighlight, fieldMatch );
} }
/** /**

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -24,10 +25,12 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
@ -51,16 +54,19 @@ public class FieldQuery {
int termOrPhraseNumber; // used for colored tag support int termOrPhraseNumber; // used for colored tag support
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ // The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024;
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this.fieldMatch = fieldMatch; this.fieldMatch = fieldMatch;
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
flatten( query, flatQueries ); flatten( query, reader, flatQueries );
saveTerms( flatQueries ); saveTerms( flatQueries, reader );
Collection<Query> expandQueries = expand( flatQueries ); Collection<Query> expandQueries = expand( flatQueries );
for( Query flatQuery : expandQueries ){ for( Query flatQuery : expandQueries ){
QueryPhraseMap rootMap = getRootMap( flatQuery ); QueryPhraseMap rootMap = getRootMap( flatQuery );
rootMap.add( flatQuery ); rootMap.add( flatQuery, reader );
if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
PhraseQuery pq = (PhraseQuery)flatQuery; PhraseQuery pq = (PhraseQuery)flatQuery;
if( pq.getTerms().length > 1 ){ if( pq.getTerms().length > 1 ){
@ -71,24 +77,37 @@ public class FieldQuery {
} }
} }
void flatten( Query sourceQuery, Collection<Query> flatQueries ){ /** For backwards compatibility you can initialize FieldQuery without
* an IndexReader, which is only required to support MultiTermQuery
*/
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this (query, null, phraseHighlight, fieldMatch);
}
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
if( sourceQuery instanceof BooleanQuery ){ if( sourceQuery instanceof BooleanQuery ){
BooleanQuery bq = (BooleanQuery)sourceQuery; BooleanQuery bq = (BooleanQuery)sourceQuery;
for( BooleanClause clause : bq.getClauses() ){ for( BooleanClause clause : bq.getClauses() ){
if( !clause.isProhibited() ) if( !clause.isProhibited() )
flatten( clause.getQuery(), flatQueries ); flatten( clause.getQuery(), reader, flatQueries );
} }
} }
else if( sourceQuery instanceof DisjunctionMaxQuery ){ else if( sourceQuery instanceof DisjunctionMaxQuery ){
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
for( Query query : dmq ){ for( Query query : dmq ){
flatten( query, flatQueries ); flatten( query, reader, flatQueries );
} }
} }
else if( sourceQuery instanceof TermQuery ){ else if( sourceQuery instanceof TermQuery ){
if( !flatQueries.contains( sourceQuery ) ) if( !flatQueries.contains( sourceQuery ) )
flatQueries.add( sourceQuery ); flatQueries.add( sourceQuery );
} }
else if (sourceQuery instanceof MultiTermQuery) {
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
flatten(mtqTerms, reader, flatQueries);
}
else if( sourceQuery instanceof PhraseQuery ){ else if( sourceQuery instanceof PhraseQuery ){
if( !flatQueries.contains( sourceQuery ) ){ if( !flatQueries.contains( sourceQuery ) ){
PhraseQuery pq = (PhraseQuery)sourceQuery; PhraseQuery pq = (PhraseQuery)sourceQuery;
@ -207,6 +226,9 @@ public class FieldQuery {
Term[] terms = pq.getTerms(); Term[] terms = pq.getTerms();
return terms[0].field(); return terms[0].field();
} }
else if (query instanceof MultiTermQuery) {
return ((MultiTermQuery)query).getField();
}
else else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
} }
@ -233,7 +255,7 @@ public class FieldQuery {
* - fieldMatch==false * - fieldMatch==false
* termSetMap=Map<null,Set<"john","lennon">> * termSetMap=Map<null,Set<"john","lennon">>
*/ */
void saveTerms( Collection<Query> flatQueries ){ void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
for( Query query : flatQueries ){ for( Query query : flatQueries ){
Set<String> termSet = getTermSet( query ); Set<String> termSet = getTermSet( query );
if( query instanceof TermQuery ) if( query instanceof TermQuery )
@ -242,6 +264,12 @@ public class FieldQuery {
for( Term term : ((PhraseQuery)query).getTerms() ) for( Term term : ((PhraseQuery)query).getTerms() )
termSet.add( term.text() ); termSet.add( term.text() );
} }
else if (query instanceof MultiTermQuery && reader != null) {
BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
for (BooleanClause clause : mtqTerms.getClauses()) {
termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
}
}
else else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
} }
@ -319,7 +347,7 @@ public class FieldQuery {
return map; return map;
} }
void add( Query query ){ void add( Query query, IndexReader reader ) throws IOException {
if( query instanceof TermQuery ){ if( query instanceof TermQuery ){
addTerm( ((TermQuery)query).getTerm(), query.getBoost() ); addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
} }

View File

@ -72,6 +72,10 @@ public class FieldTermStack {
public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException { public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
this.fieldName = fieldName; this.fieldName = fieldName;
Set<String> termSet = fieldQuery.getTermSet( fieldName );
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName ); TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
if( tfv == null ) return; // just return to make null snippets if( tfv == null ) return; // just return to make null snippets
TermPositionVector tpv = null; TermPositionVector tpv = null;
@ -82,9 +86,6 @@ public class FieldTermStack {
return; // just return to make null snippets return; // just return to make null snippets
} }
Set<String> termSet = fieldQuery.getTermSet( fieldName );
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
final CharsRef spare = new CharsRef(); final CharsRef spare = new CharsRef();
for( BytesRef term : tpv.getTerms() ){ for( BytesRef term : tpv.getTerms() ){
if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue; if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;

View File

@ -24,6 +24,7 @@ This is an another highlighter implementation.
<li>fast for large docs</li> <li>fast for large docs</li>
<li>support N-gram fields</li> <li>support N-gram fields</li>
<li>support phrase-unit highlighting with slops</li> <li>support phrase-unit highlighting with slops</li>
<li>support multi-term queries (wildcard, range, regexp, etc.)</li>
<li>need Java 1.5</li> <li>need Java 1.5</li>
<li>highlight fields need to be stored with Positions and Offsets</li> <li>highlight fields need to be stored with Positions and Offsets</li>
<li>take into account query boost to score fragments</li> <li>take into account query boost to score fragments</li>

View File

@ -16,19 +16,23 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -48,7 +52,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery(booleanQuery, true, true ); FieldQuery fq = new FieldQuery(booleanQuery, true, true );
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
fq.flatten(booleanQuery, flatQueries); fq.flatten(booleanQuery, reader, flatQueries);
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) ); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) );
} }
@ -56,7 +60,7 @@ public class FieldQueryTest extends AbstractTestCase {
Query query = dmq( tq( "A" ), tq( "B" ), pqF( "C", "D" ) ); Query query = dmq( tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
FieldQuery fq = new FieldQuery( query, true, true ); FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries ); fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), pqF( "C", "D" ) ); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
} }
@ -70,7 +74,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery(booleanQuery, true, true ); FieldQuery fq = new FieldQuery(booleanQuery, true, true );
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
fq.flatten(booleanQuery, flatQueries); fq.flatten(booleanQuery, reader, flatQueries);
assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) ); assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) );
} }
@ -82,7 +86,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery( query, true, true ); FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries ); fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) ); assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) );
} }
@ -90,7 +94,7 @@ public class FieldQueryTest extends AbstractTestCase {
Query query = pqF( "A" ); Query query = pqF( "A" );
FieldQuery fq = new FieldQuery( query, true, true ); FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>(); Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries ); fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ) ); assertCollectionQueries( flatQueries, tq( "A" ) );
} }
@ -869,4 +873,36 @@ public class FieldQueryTest extends AbstractTestCase {
phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) ); phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) );
assertNull( fq.searchPhrase( F, phraseCandidate ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) );
} }
/** Runs the shared "defg" multi-term checks against a wildcard query ("d*g") on field F. */
public void testHighlightQuery() throws Exception {
  makeIndexStrMV();
  final Term wildcardTerm = new Term( F, "d*g" );
  final Query wildcardQuery = new WildcardQuery( wildcardTerm );
  defgMultiTermQueryTest( wildcardQuery );
}
/** Runs the shared "defg" multi-term checks against a prefix query ("de") on field F. */
public void testPrefixQuery() throws Exception {
  makeIndexStrMV();
  final Term prefix = new Term( F, "de" );
  final Query prefixQuery = new PrefixQuery( prefix );
  defgMultiTermQueryTest( prefixQuery );
}
/** Runs the shared "defg" multi-term checks against a regular-expression query ("d[a-z].g") on field F. */
public void testRegexpQuery() throws Exception {
  makeIndexStrMV();
  final Query regexpQuery = new RegexpQuery( new Term( F, "d[a-z].g" ) );
  defgMultiTermQueryTest( regexpQuery );
}
/** Runs the shared "defg" multi-term checks against a term range query from "d" to "e" (both bounds inclusive) on field F. */
public void testRangeQuery() throws Exception {
  makeIndexStrMV();
  final BytesRef lowerBound = new BytesRef( "d" );
  final BytesRef upperBound = new BytesRef( "e" );
  final Query rangeQuery = new TermRangeQuery( F, lowerBound, upperBound, true, true );
  defgMultiTermQueryTest( rangeQuery );
}
/**
 * Shared assertions for the multi-term query tests: builds a {@link FieldQuery}
 * from the given query and the test reader, then checks that "defg" is known
 * to it while "dog" is not.
 *
 * @param query the multi-term query under test
 * @throws IOException if constructing the {@link FieldQuery} fails
 */
private void defgMultiTermQueryTest(Query query) throws IOException {
  final FieldQuery fieldQuery = new FieldQuery( query, reader, true, true );

  // "defg" must have a term map entry on field F; "dog" must not
  assertNotNull( fieldQuery.getFieldTermMap( F, "defg" ) );
  assertNull( fieldQuery.getFieldTermMap( F, "dog" ) );

  // a single-term candidate for "defg" must be found by the phrase search
  final TermInfo defgInfo = new TermInfo( "defg", 0, 12, 0 );
  final List<TermInfo> candidates = new ArrayList<TermInfo>();
  candidates.add( defgInfo );
  assertNotNull( fieldQuery.searchPhrase( F, candidates ) );
}
} }

View File

@ -16,8 +16,10 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.WildcardQuery;
public class FieldTermStackTest extends AbstractTestCase { public class FieldTermStackTest extends AbstractTestCase {
@ -158,4 +160,17 @@ public class FieldTermStackTest extends AbstractTestCase {
assertEquals( "ee(90,92,63)", stack.pop().toString() ); assertEquals( "ee(90,92,63)", stack.pop().toString() );
assertEquals( "ed(91,93,64)", stack.pop().toString() ); assertEquals( "ed(91,93,64)", stack.pop().toString() );
} }
/**
 * Builds a {@link FieldTermStack} for document 0 from a wildcard query ("th*e")
 * and checks the number of collected terms and the order in which they pop.
 */
public void testWildcard() throws Exception {
  makeIndexLongMV();
  final WildcardQuery wildcard = new WildcardQuery( new Term( F, "th*e" ) );
  final FieldQuery fieldQuery = new FieldQuery( wildcard, reader, true, true );
  final FieldTermStack termStack = new FieldTermStack( reader, 0, F, fieldQuery );

  // expected entries, listed in the order pop() must return them
  final String[] expectedPops = {
    "the(15,18,2)",
    "these(133,138,20)",
    "the(153,156,23)",
    "the(195,198,31)"
  };
  assertEquals( expectedPops.length, termStack.termList.size() );
  for( String expected : expectedPops ){
    assertEquals( expected, termStack.pop().toString() );
  }
}
} }

View File

@ -95,11 +95,12 @@ public class SearchTravRetVectorHighlightTask extends SearchTravTask {
@Override @Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
highlighter = new FastVectorHighlighter( false, false ); highlighter = new FastVectorHighlighter( false, false );
final FieldQuery fq = highlighter.getFieldQuery( q ); final Query myq = q;
return new BenchmarkHighlighter(){ return new BenchmarkHighlighter(){
@Override @Override
public int doHighlight(IndexReader reader, int doc, String field, public int doHighlight(IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text) throws Exception { Document document, Analyzer analyzer, String text) throws Exception {
final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
return fragments != null ? fragments.length : 0; return fragments != null ? fragments.length : 0;
} }