diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 31080ef6c57..53f06ac7871 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -83,6 +83,8 @@ New Features SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi) + * LUCENE-1889: Add MultiTermQuery support for FVH. (Mike Sokolov via Koji Sekiguchi) + Bug Fixes * LUCENE-3417: DictionaryCompoundWordFilter did not properly add tokens from the diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java index bdf84d1cd60..54072b92520 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java @@ -76,8 +76,23 @@ public class FastVectorHighlighter { * @param query a query * @return the created {@link FieldQuery} object */ - public FieldQuery getFieldQuery( Query query ){ - return new FieldQuery( query, phraseHighlight, fieldMatch ); + public FieldQuery getFieldQuery( Query query ) { + try { + return new FieldQuery( query, null, phraseHighlight, fieldMatch ); + } catch (IOException e) { + // should never be thrown when reader is null + throw new RuntimeException (e); + } + } + + /** + * create a {@link FieldQuery} object. + * + * @param query a query + * @return the created {@link FieldQuery} object + */ + public FieldQuery getFieldQuery( Query query, IndexReader reader ) throws IOException { + return new FieldQuery( query, reader, phraseHighlight, fieldMatch ); } /** diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java index 0d02cd9c97e..7df3635d530 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java @@ -16,6 +16,7 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. */ +import java.io.IOException; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -24,10 +25,12 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; @@ -51,16 +54,19 @@ public class FieldQuery { int termOrPhraseNumber; // used for colored tag support - FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ + // The maximum number of different matching terms accumulated from any one MultiTermQuery + private static final int MAX_MTQ_TERMS = 1024; + + FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { this.fieldMatch = fieldMatch; Set flatQueries = new HashSet(); - flatten( query, flatQueries ); - saveTerms( flatQueries ); + flatten( query, reader, flatQueries ); + saveTerms( flatQueries, reader ); Collection expandQueries = expand( flatQueries ); for( Query flatQuery : expandQueries ){ QueryPhraseMap rootMap = getRootMap( flatQuery ); - rootMap.add( flatQuery ); + rootMap.add( flatQuery, reader ); if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ PhraseQuery pq = (PhraseQuery)flatQuery; if( pq.getTerms().length > 1 ){ @@ -71,24 +77,37 @@ public class FieldQuery { } } - void flatten( Query sourceQuery, Collection flatQueries ){ + /** For backwards compatibility you can initialize FieldQuery without + * an IndexReader, which is only required to support MultiTermQuery + */ + FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException { + this (query, null, phraseHighlight, fieldMatch); + } + + void flatten( Query sourceQuery, IndexReader reader, Collection flatQueries ) throws IOException{ if( sourceQuery instanceof BooleanQuery ){ BooleanQuery bq = (BooleanQuery)sourceQuery; for( BooleanClause clause : bq.getClauses() ){ if( !clause.isProhibited() ) - flatten( clause.getQuery(), flatQueries ); + flatten( clause.getQuery(), reader, flatQueries ); } } else if( sourceQuery instanceof DisjunctionMaxQuery ){ DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; for( Query query : dmq ){ - flatten( query, flatQueries ); + flatten( query, reader, flatQueries ); } } else if( sourceQuery instanceof TermQuery ){ if( !flatQueries.contains( sourceQuery ) ) flatQueries.add( sourceQuery ); } + else if (sourceQuery instanceof MultiTermQuery) { + MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone(); + copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS)); + BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader); + flatten(mtqTerms, reader, flatQueries); + } else if( sourceQuery instanceof PhraseQuery ){ if( !flatQueries.contains( sourceQuery ) ){ PhraseQuery pq = (PhraseQuery)sourceQuery; @@ -207,6 +226,9 @@ public class FieldQuery { Term[] terms = pq.getTerms(); return terms[0].field(); } + else if (query instanceof MultiTermQuery) { + return ((MultiTermQuery)query).getField(); + } else throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); } @@ -233,7 +255,7 @@ public class FieldQuery { * - fieldMatch==false * termSetMap=Map> */ - void saveTerms( Collection flatQueries ){ + void saveTerms( Collection flatQueries, IndexReader reader ) throws IOException{ for( Query query : flatQueries ){ Set termSet = getTermSet( query ); if( query instanceof TermQuery ) @@ -242,6 +264,12 @@ public class FieldQuery { for( Term term : ((PhraseQuery)query).getTerms() ) termSet.add( term.text() ); } + else if (query instanceof MultiTermQuery && reader != null) { + BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader); + for (BooleanClause clause : mtqTerms.getClauses()) { + termSet.add (((TermQuery) clause.getQuery()).getTerm().text()); + } + } else throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); } @@ -319,7 +347,7 @@ public class FieldQuery { return map; } - void add( Query query ){ + void add( Query query, IndexReader reader ) throws IOException { if( query instanceof TermQuery ){ addTerm( ((TermQuery)query).getTerm(), query.getBoost() ); } diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 93f713e9782..ede19b1f381 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -72,6 +72,10 @@ public class FieldTermStack { public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException { this.fieldName = fieldName; + Set termSet = fieldQuery.getTermSet( fieldName ); + // just return to make null snippet if un-matched fieldName specified when fieldMatch == true + if( termSet == null ) return; + TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName ); if( tfv == null ) return; // just return to make null snippets TermPositionVector tpv = null; @@ -82,9 +86,6 @@ public class FieldTermStack { return; // just return to make null snippets } - Set termSet = fieldQuery.getTermSet( fieldName ); - // just return to make null snippet if un-matched fieldName specified when fieldMatch == true - if( termSet == null ) return; final CharsRef spare = new CharsRef(); for( BytesRef term : tpv.getTerms() ){ if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue; diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html index 378ea588ad9..8afb201f349 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html @@ -24,6 +24,7 @@ This is an another highlighter implementation.
  • fast for large docs
  • support N-gram fields
  • support phrase-unit highlighting with slops
  • +
  • support multi-term (includes wildcard, range, regexp, etc) queries
  • need Java 1.5
  • highlight fields need to be stored with Positions and Offsets
  • take into account query boost to score fragments
  • diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java index 74d2d9919f1..c6aec47704e 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java @@ -16,19 +16,23 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; import org.apache.lucene.util.BytesRef; @@ -48,7 +52,7 @@ public class FieldQueryTest extends AbstractTestCase { FieldQuery fq = new FieldQuery(booleanQuery, true, true ); Set flatQueries = new HashSet(); - fq.flatten(booleanQuery, flatQueries); + fq.flatten(booleanQuery, reader, flatQueries); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) ); } @@ -56,7 +60,7 @@ public class FieldQueryTest extends AbstractTestCase { Query query = dmq( tq( "A" ), tq( "B" ), pqF( "C", "D" ) ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); - fq.flatten( query, flatQueries ); + fq.flatten( query, reader, flatQueries ); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), pqF( "C", "D" ) ); } @@ -70,7 +74,7 @@ public class FieldQueryTest extends AbstractTestCase { FieldQuery fq = new FieldQuery(booleanQuery, true, true ); Set flatQueries = new HashSet(); - fq.flatten(booleanQuery, flatQueries); + fq.flatten(booleanQuery, reader, flatQueries); assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) ); } @@ -82,7 +86,7 @@ public class FieldQueryTest extends AbstractTestCase { FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); - fq.flatten( query, flatQueries ); + fq.flatten( query, reader, flatQueries ); assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) ); } @@ -90,7 +94,7 @@ public class FieldQueryTest extends AbstractTestCase { Query query = pqF( "A" ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); - fq.flatten( query, flatQueries ); + fq.flatten( query, reader, flatQueries ); assertCollectionQueries( flatQueries, tq( "A" ) ); } @@ -869,4 +873,36 @@ public class FieldQueryTest extends AbstractTestCase { phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); } + + public void testHighlightQuery() throws Exception { + makeIndexStrMV(); + defgMultiTermQueryTest(new WildcardQuery(new Term(F, "d*g"))); + } + + public void testPrefixQuery() throws Exception { + makeIndexStrMV(); + defgMultiTermQueryTest(new PrefixQuery(new Term(F, "de"))); + } + + public void testRegexpQuery() throws Exception { + makeIndexStrMV(); + Term term = new Term(F, "d[a-z].g"); + defgMultiTermQueryTest(new RegexpQuery (term)); + } + + public void testRangeQuery() throws Exception { + makeIndexStrMV(); + defgMultiTermQueryTest(new TermRangeQuery (F, new BytesRef("d"), new BytesRef("e"), true, true)); + } + + private void defgMultiTermQueryTest(Query query) throws IOException { + FieldQuery fq = new FieldQuery( query, reader, true, true ); + QueryPhraseMap qpm = fq.getFieldTermMap(F, "defg"); + assertNotNull (qpm); + assertNull (fq.getFieldTermMap(F, "dog")); + List phraseCandidate = new ArrayList(); + phraseCandidate.add( new TermInfo( "defg", 0, 12, 0 ) ); + assertNotNull (fq.searchPhrase(F, phraseCandidate)); + } + } diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java index e59bc41a082..68402b4d6b2 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java @@ -16,8 +16,10 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. */ +import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.WildcardQuery; public class FieldTermStackTest extends AbstractTestCase { @@ -158,4 +160,17 @@ public class FieldTermStackTest extends AbstractTestCase { assertEquals( "ee(90,92,63)", stack.pop().toString() ); assertEquals( "ed(91,93,64)", stack.pop().toString() ); } + + + public void testWildcard() throws Exception { + makeIndexLongMV(); + FieldQuery fq = new FieldQuery( new WildcardQuery (new Term(F, "th*e")), reader, true, true ); + FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); + assertEquals (4, stack.termList.size()); + assertEquals ("the(15,18,2)", stack.pop().toString()); + assertEquals ("these(133,138,20)", stack.pop().toString()); + assertEquals ("the(153,156,23)", stack.pop().toString()); + assertEquals ("the(195,198,31)", stack.pop().toString()); + } + } diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java index 19570afdd35..edd31b8e25d 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java @@ -95,11 +95,12 @@ public class SearchTravRetVectorHighlightTask extends SearchTravTask { @Override protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ highlighter = new FastVectorHighlighter( false, false ); - final FieldQuery fq = highlighter.getFieldQuery( q ); + final Query myq = q; return new BenchmarkHighlighter(){ @Override public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception { + final FieldQuery fq = highlighter.getFieldQuery( myq, reader); String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); return fragments != null ? fragments.length : 0; }