mirror of https://github.com/apache/lucene.git
LUCENE-1889: add MultiTermQuery support for FVH
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166954 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
896afc4c01
commit
9e15eeaa86
|
@ -83,6 +83,8 @@ New Features
|
|||
SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
|
||||
can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
|
||||
|
||||
* LUCENE-1889: Add MultiTermQuery support for FVH. (Mike Sokolov via Koji Sekiguchi)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the
|
||||
|
|
|
@ -77,7 +77,22 @@ public class FastVectorHighlighter {
|
|||
* @return the created {@link FieldQuery} object
|
||||
*/
|
||||
public FieldQuery getFieldQuery( Query query ) {
|
||||
return new FieldQuery( query, phraseHighlight, fieldMatch );
|
||||
try {
|
||||
return new FieldQuery( query, null, phraseHighlight, fieldMatch );
|
||||
} catch (IOException e) {
|
||||
// should never be thrown when reader is null
|
||||
throw new RuntimeException (e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create a {@link FieldQuery} object.
|
||||
*
|
||||
* @param query a query
|
||||
* @return the created {@link FieldQuery} object
|
||||
*/
|
||||
public FieldQuery getFieldQuery( Query query, IndexReader reader ) throws IOException {
|
||||
return new FieldQuery( query, reader, phraseHighlight, fieldMatch );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -16,6 +16,7 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -24,10 +25,12 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
@ -51,16 +54,19 @@ public class FieldQuery {
|
|||
|
||||
int termOrPhraseNumber; // used for colored tag support
|
||||
|
||||
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){
|
||||
// The maximum number of different matching terms accumulated from any one MultiTermQuery
|
||||
private static final int MAX_MTQ_TERMS = 1024;
|
||||
|
||||
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
|
||||
this.fieldMatch = fieldMatch;
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
flatten( query, flatQueries );
|
||||
saveTerms( flatQueries );
|
||||
flatten( query, reader, flatQueries );
|
||||
saveTerms( flatQueries, reader );
|
||||
Collection<Query> expandQueries = expand( flatQueries );
|
||||
|
||||
for( Query flatQuery : expandQueries ){
|
||||
QueryPhraseMap rootMap = getRootMap( flatQuery );
|
||||
rootMap.add( flatQuery );
|
||||
rootMap.add( flatQuery, reader );
|
||||
if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
|
||||
PhraseQuery pq = (PhraseQuery)flatQuery;
|
||||
if( pq.getTerms().length > 1 ){
|
||||
|
@ -71,24 +77,37 @@ public class FieldQuery {
|
|||
}
|
||||
}
|
||||
|
||||
void flatten( Query sourceQuery, Collection<Query> flatQueries ){
|
||||
/** For backwards compatibility you can initialize FieldQuery without
|
||||
* an IndexReader, which is only required to support MultiTermQuery
|
||||
*/
|
||||
FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
|
||||
this (query, null, phraseHighlight, fieldMatch);
|
||||
}
|
||||
|
||||
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
|
||||
if( sourceQuery instanceof BooleanQuery ){
|
||||
BooleanQuery bq = (BooleanQuery)sourceQuery;
|
||||
for( BooleanClause clause : bq.getClauses() ){
|
||||
if( !clause.isProhibited() )
|
||||
flatten( clause.getQuery(), flatQueries );
|
||||
flatten( clause.getQuery(), reader, flatQueries );
|
||||
}
|
||||
}
|
||||
else if( sourceQuery instanceof DisjunctionMaxQuery ){
|
||||
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
|
||||
for( Query query : dmq ){
|
||||
flatten( query, flatQueries );
|
||||
flatten( query, reader, flatQueries );
|
||||
}
|
||||
}
|
||||
else if( sourceQuery instanceof TermQuery ){
|
||||
if( !flatQueries.contains( sourceQuery ) )
|
||||
flatQueries.add( sourceQuery );
|
||||
}
|
||||
else if (sourceQuery instanceof MultiTermQuery) {
|
||||
MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
|
||||
copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
|
||||
BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
|
||||
flatten(mtqTerms, reader, flatQueries);
|
||||
}
|
||||
else if( sourceQuery instanceof PhraseQuery ){
|
||||
if( !flatQueries.contains( sourceQuery ) ){
|
||||
PhraseQuery pq = (PhraseQuery)sourceQuery;
|
||||
|
@ -207,6 +226,9 @@ public class FieldQuery {
|
|||
Term[] terms = pq.getTerms();
|
||||
return terms[0].field();
|
||||
}
|
||||
else if (query instanceof MultiTermQuery) {
|
||||
return ((MultiTermQuery)query).getField();
|
||||
}
|
||||
else
|
||||
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
|
||||
}
|
||||
|
@ -233,7 +255,7 @@ public class FieldQuery {
|
|||
* - fieldMatch==false
|
||||
* termSetMap=Map<null,Set<"john","lennon">>
|
||||
*/
|
||||
void saveTerms( Collection<Query> flatQueries ){
|
||||
void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
|
||||
for( Query query : flatQueries ){
|
||||
Set<String> termSet = getTermSet( query );
|
||||
if( query instanceof TermQuery )
|
||||
|
@ -242,6 +264,12 @@ public class FieldQuery {
|
|||
for( Term term : ((PhraseQuery)query).getTerms() )
|
||||
termSet.add( term.text() );
|
||||
}
|
||||
else if (query instanceof MultiTermQuery && reader != null) {
|
||||
BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
|
||||
for (BooleanClause clause : mtqTerms.getClauses()) {
|
||||
termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
|
||||
}
|
||||
}
|
||||
else
|
||||
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
|
||||
}
|
||||
|
@ -319,7 +347,7 @@ public class FieldQuery {
|
|||
return map;
|
||||
}
|
||||
|
||||
void add( Query query ){
|
||||
void add( Query query, IndexReader reader ) throws IOException {
|
||||
if( query instanceof TermQuery ){
|
||||
addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
|
||||
}
|
||||
|
|
|
@ -72,6 +72,10 @@ public class FieldTermStack {
|
|||
public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
|
||||
this.fieldName = fieldName;
|
||||
|
||||
Set<String> termSet = fieldQuery.getTermSet( fieldName );
|
||||
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
|
||||
if( termSet == null ) return;
|
||||
|
||||
TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
|
||||
if( tfv == null ) return; // just return to make null snippets
|
||||
TermPositionVector tpv = null;
|
||||
|
@ -82,9 +86,6 @@ public class FieldTermStack {
|
|||
return; // just return to make null snippets
|
||||
}
|
||||
|
||||
Set<String> termSet = fieldQuery.getTermSet( fieldName );
|
||||
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
|
||||
if( termSet == null ) return;
|
||||
final CharsRef spare = new CharsRef();
|
||||
for( BytesRef term : tpv.getTerms() ){
|
||||
if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;
|
||||
|
|
|
@ -24,6 +24,7 @@ This is an another highlighter implementation.
|
|||
<li>fast for large docs</li>
|
||||
<li>support N-gram fields</li>
|
||||
<li>support phrase-unit highlighting with slops</li>
|
||||
<li>support multi-term (includes wildcard, range, regexp, etc) queries</li>
|
||||
<li>need Java 1.5</li>
|
||||
<li>highlight fields need to be stored with Positions and Offsets</li>
|
||||
<li>take into account query boost to score fragments</li>
|
||||
|
|
|
@ -16,19 +16,23 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.RegexpQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
|
||||
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -48,7 +52,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
|
||||
FieldQuery fq = new FieldQuery(booleanQuery, true, true );
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
fq.flatten(booleanQuery, flatQueries);
|
||||
fq.flatten(booleanQuery, reader, flatQueries);
|
||||
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) );
|
||||
}
|
||||
|
||||
|
@ -56,7 +60,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
Query query = dmq( tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
|
||||
FieldQuery fq = new FieldQuery( query, true, true );
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
fq.flatten( query, flatQueries );
|
||||
fq.flatten( query, reader, flatQueries );
|
||||
assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), pqF( "C", "D" ) );
|
||||
}
|
||||
|
||||
|
@ -70,7 +74,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
|
||||
FieldQuery fq = new FieldQuery(booleanQuery, true, true );
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
fq.flatten(booleanQuery, flatQueries);
|
||||
fq.flatten(booleanQuery, reader, flatQueries);
|
||||
assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) );
|
||||
}
|
||||
|
||||
|
@ -82,7 +86,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
|
||||
FieldQuery fq = new FieldQuery( query, true, true );
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
fq.flatten( query, flatQueries );
|
||||
fq.flatten( query, reader, flatQueries );
|
||||
assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) );
|
||||
}
|
||||
|
||||
|
@ -90,7 +94,7 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
Query query = pqF( "A" );
|
||||
FieldQuery fq = new FieldQuery( query, true, true );
|
||||
Set<Query> flatQueries = new HashSet<Query>();
|
||||
fq.flatten( query, flatQueries );
|
||||
fq.flatten( query, reader, flatQueries );
|
||||
assertCollectionQueries( flatQueries, tq( "A" ) );
|
||||
}
|
||||
|
||||
|
@ -869,4 +873,36 @@ public class FieldQueryTest extends AbstractTestCase {
|
|||
phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) );
|
||||
assertNull( fq.searchPhrase( F, phraseCandidate ) );
|
||||
}
|
||||
|
||||
public void testHighlightQuery() throws Exception {
|
||||
makeIndexStrMV();
|
||||
defgMultiTermQueryTest(new WildcardQuery(new Term(F, "d*g")));
|
||||
}
|
||||
|
||||
public void testPrefixQuery() throws Exception {
|
||||
makeIndexStrMV();
|
||||
defgMultiTermQueryTest(new PrefixQuery(new Term(F, "de")));
|
||||
}
|
||||
|
||||
public void testRegexpQuery() throws Exception {
|
||||
makeIndexStrMV();
|
||||
Term term = new Term(F, "d[a-z].g");
|
||||
defgMultiTermQueryTest(new RegexpQuery (term));
|
||||
}
|
||||
|
||||
public void testRangeQuery() throws Exception {
|
||||
makeIndexStrMV();
|
||||
defgMultiTermQueryTest(new TermRangeQuery (F, new BytesRef("d"), new BytesRef("e"), true, true));
|
||||
}
|
||||
|
||||
private void defgMultiTermQueryTest(Query query) throws IOException {
|
||||
FieldQuery fq = new FieldQuery( query, reader, true, true );
|
||||
QueryPhraseMap qpm = fq.getFieldTermMap(F, "defg");
|
||||
assertNotNull (qpm);
|
||||
assertNull (fq.getFieldTermMap(F, "dog"));
|
||||
List<TermInfo> phraseCandidate = new ArrayList<TermInfo>();
|
||||
phraseCandidate.add( new TermInfo( "defg", 0, 12, 0 ) );
|
||||
assertNotNull (fq.searchPhrase(F, phraseCandidate));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -16,8 +16,10 @@ package org.apache.lucene.search.vectorhighlight;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
|
||||
public class FieldTermStackTest extends AbstractTestCase {
|
||||
|
||||
|
@ -158,4 +160,17 @@ public class FieldTermStackTest extends AbstractTestCase {
|
|||
assertEquals( "ee(90,92,63)", stack.pop().toString() );
|
||||
assertEquals( "ed(91,93,64)", stack.pop().toString() );
|
||||
}
|
||||
|
||||
|
||||
public void testWildcard() throws Exception {
|
||||
makeIndexLongMV();
|
||||
FieldQuery fq = new FieldQuery( new WildcardQuery (new Term(F, "th*e")), reader, true, true );
|
||||
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
|
||||
assertEquals (4, stack.termList.size());
|
||||
assertEquals ("the(15,18,2)", stack.pop().toString());
|
||||
assertEquals ("these(133,138,20)", stack.pop().toString());
|
||||
assertEquals ("the(153,156,23)", stack.pop().toString());
|
||||
assertEquals ("the(195,198,31)", stack.pop().toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -95,11 +95,12 @@ public class SearchTravRetVectorHighlightTask extends SearchTravTask {
|
|||
@Override
|
||||
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
|
||||
highlighter = new FastVectorHighlighter( false, false );
|
||||
final FieldQuery fq = highlighter.getFieldQuery( q );
|
||||
final Query myq = q;
|
||||
return new BenchmarkHighlighter(){
|
||||
@Override
|
||||
public int doHighlight(IndexReader reader, int doc, String field,
|
||||
Document document, Analyzer analyzer, String text) throws Exception {
|
||||
final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
|
||||
String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
|
||||
return fragments != null ? fragments.length : 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue