LUCENE-1889: add MultiTermQuery support for FVH

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166954 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-09-09 01:36:53 +00:00
parent 896afc4c01
commit 9e15eeaa86
8 changed files with 120 additions and 21 deletions

View File

@ -83,6 +83,8 @@ New Features
SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
* LUCENE-1889: Add MultiTermQuery support for FVH. (Mike Sokolov via Koji Sekiguchi)
Bug Fixes
* LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the

View File

@ -77,7 +77,22 @@ public class FastVectorHighlighter {
* @return the created {@link FieldQuery} object
*/
public FieldQuery getFieldQuery( Query query ) {
return new FieldQuery( query, phraseHighlight, fieldMatch );
try {
return new FieldQuery( query, null, phraseHighlight, fieldMatch );
} catch (IOException e) {
// should never be thrown when reader is null
throw new RuntimeException (e);
}
}
/**
* create a {@link FieldQuery} object.
*
* @param query a query
* @return the created {@link FieldQuery} object
*/
public FieldQuery getFieldQuery( Query query, IndexReader reader ) throws IOException {
return new FieldQuery( query, reader, phraseHighlight, fieldMatch );
}
/**

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@ -24,10 +25,12 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
@ -51,16 +54,19 @@ public class FieldQuery {
int termOrPhraseNumber; // used for colored tag support
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){
// The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024;
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this.fieldMatch = fieldMatch;
Set<Query> flatQueries = new HashSet<Query>();
flatten( query, flatQueries );
saveTerms( flatQueries );
flatten( query, reader, flatQueries );
saveTerms( flatQueries, reader );
Collection<Query> expandQueries = expand( flatQueries );
for( Query flatQuery : expandQueries ){
QueryPhraseMap rootMap = getRootMap( flatQuery );
rootMap.add( flatQuery );
rootMap.add( flatQuery, reader );
if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
PhraseQuery pq = (PhraseQuery)flatQuery;
if( pq.getTerms().length > 1 ){
@ -71,24 +77,37 @@ public class FieldQuery {
}
}
void flatten( Query sourceQuery, Collection<Query> flatQueries ){
/** For backwards compatibility you can initialize FieldQuery without
* an IndexReader, which is only required to support MultiTermQuery
*/
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this (query, null, phraseHighlight, fieldMatch);
}
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
if( sourceQuery instanceof BooleanQuery ){
BooleanQuery bq = (BooleanQuery)sourceQuery;
for( BooleanClause clause : bq.getClauses() ){
if( !clause.isProhibited() )
flatten( clause.getQuery(), flatQueries );
flatten( clause.getQuery(), reader, flatQueries );
}
}
else if( sourceQuery instanceof DisjunctionMaxQuery ){
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
for( Query query : dmq ){
flatten( query, flatQueries );
flatten( query, reader, flatQueries );
}
}
else if( sourceQuery instanceof TermQuery ){
if( !flatQueries.contains( sourceQuery ) )
flatQueries.add( sourceQuery );
}
else if (sourceQuery instanceof MultiTermQuery) {
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
flatten(mtqTerms, reader, flatQueries);
}
else if( sourceQuery instanceof PhraseQuery ){
if( !flatQueries.contains( sourceQuery ) ){
PhraseQuery pq = (PhraseQuery)sourceQuery;
@ -207,6 +226,9 @@ public class FieldQuery {
Term[] terms = pq.getTerms();
return terms[0].field();
}
else if (query instanceof MultiTermQuery) {
return ((MultiTermQuery)query).getField();
}
else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
}
@ -233,7 +255,7 @@ public class FieldQuery {
* - fieldMatch==false
* termSetMap=Map<null,Set<"john","lennon">>
*/
void saveTerms( Collection<Query> flatQueries ){
void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
for( Query query : flatQueries ){
Set<String> termSet = getTermSet( query );
if( query instanceof TermQuery )
@ -242,6 +264,12 @@ public class FieldQuery {
for( Term term : ((PhraseQuery)query).getTerms() )
termSet.add( term.text() );
}
else if (query instanceof MultiTermQuery && reader != null) {
BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
for (BooleanClause clause : mtqTerms.getClauses()) {
termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
}
}
else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
}
@ -319,7 +347,7 @@ public class FieldQuery {
return map;
}
void add( Query query ){
void add( Query query, IndexReader reader ) throws IOException {
if( query instanceof TermQuery ){
addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
}

View File

@ -72,6 +72,10 @@ public class FieldTermStack {
public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
this.fieldName = fieldName;
Set<String> termSet = fieldQuery.getTermSet( fieldName );
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
if( tfv == null ) return; // just return to make null snippets
TermPositionVector tpv = null;
@ -82,9 +86,6 @@ public class FieldTermStack {
return; // just return to make null snippets
}
Set<String> termSet = fieldQuery.getTermSet( fieldName );
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
final CharsRef spare = new CharsRef();
for( BytesRef term : tpv.getTerms() ){
if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;

View File

@ -24,6 +24,7 @@ This is an another highlighter implementation.
<li>fast for large docs</li>
<li>support N-gram fields</li>
<li>support phrase-unit highlighting with slops</li>
<li>support multi-term (includes wildcard, range, regexp, etc) queries</li>
<li>need Java 1.5</li>
<li>highlight fields need to be stored with Positions and Offsets</li>
<li>take into account query boost to score fragments</li>

View File

@ -16,19 +16,23 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.BytesRef;
@ -48,7 +52,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery(booleanQuery, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten(booleanQuery, flatQueries);
fq.flatten(booleanQuery, reader, flatQueries);
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) );
}
@ -56,7 +60,7 @@ public class FieldQueryTest extends AbstractTestCase {
Query query = dmq( tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries );
fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
}
@ -70,7 +74,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery(booleanQuery, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten(booleanQuery, flatQueries);
fq.flatten(booleanQuery, reader, flatQueries);
assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) );
}
@ -82,7 +86,7 @@ public class FieldQueryTest extends AbstractTestCase {
FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries );
fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) );
}
@ -90,7 +94,7 @@ public class FieldQueryTest extends AbstractTestCase {
Query query = pqF( "A" );
FieldQuery fq = new FieldQuery( query, true, true );
Set<Query> flatQueries = new HashSet<Query>();
fq.flatten( query, flatQueries );
fq.flatten( query, reader, flatQueries );
assertCollectionQueries( flatQueries, tq( "A" ) );
}
@ -869,4 +873,36 @@ public class FieldQueryTest extends AbstractTestCase {
phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) );
assertNull( fq.searchPhrase( F, phraseCandidate ) );
}
public void testHighlightQuery() throws Exception {
makeIndexStrMV();
defgMultiTermQueryTest(new WildcardQuery(new Term(F, "d*g")));
}
public void testPrefixQuery() throws Exception {
makeIndexStrMV();
defgMultiTermQueryTest(new PrefixQuery(new Term(F, "de")));
}
public void testRegexpQuery() throws Exception {
makeIndexStrMV();
Term term = new Term(F, "d[a-z].g");
defgMultiTermQueryTest(new RegexpQuery (term));
}
public void testRangeQuery() throws Exception {
makeIndexStrMV();
defgMultiTermQueryTest(new TermRangeQuery (F, new BytesRef("d"), new BytesRef("e"), true, true));
}
private void defgMultiTermQueryTest(Query query) throws IOException {
FieldQuery fq = new FieldQuery( query, reader, true, true );
QueryPhraseMap qpm = fq.getFieldTermMap(F, "defg");
assertNotNull (qpm);
assertNull (fq.getFieldTermMap(F, "dog"));
List<TermInfo> phraseCandidate = new ArrayList<TermInfo>();
phraseCandidate.add( new TermInfo( "defg", 0, 12, 0 ) );
assertNotNull (fq.searchPhrase(F, phraseCandidate));
}
}

View File

@ -16,8 +16,10 @@ package org.apache.lucene.search.vectorhighlight;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.WildcardQuery;
public class FieldTermStackTest extends AbstractTestCase {
@ -158,4 +160,17 @@ public class FieldTermStackTest extends AbstractTestCase {
assertEquals( "ee(90,92,63)", stack.pop().toString() );
assertEquals( "ed(91,93,64)", stack.pop().toString() );
}
public void testWildcard() throws Exception {
makeIndexLongMV();
FieldQuery fq = new FieldQuery( new WildcardQuery (new Term(F, "th*e")), reader, true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
assertEquals (4, stack.termList.size());
assertEquals ("the(15,18,2)", stack.pop().toString());
assertEquals ("these(133,138,20)", stack.pop().toString());
assertEquals ("the(153,156,23)", stack.pop().toString());
assertEquals ("the(195,198,31)", stack.pop().toString());
}
}

View File

@ -95,11 +95,12 @@ public class SearchTravRetVectorHighlightTask extends SearchTravTask {
@Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
highlighter = new FastVectorHighlighter( false, false );
final FieldQuery fq = highlighter.getFieldQuery( q );
final Query myq = q;
return new BenchmarkHighlighter(){
@Override
public int doHighlight(IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text) throws Exception {
final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
return fragments != null ? fragments.length : 0;
}