Add ConstantScore highlighting support to SpanScorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@763856 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-04-10 02:09:46 +00:00
parent d74d56d9c9
commit a1b3fd7240
4 changed files with 182 additions and 34 deletions

View File

@ -9,6 +9,7 @@ import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Query;
@ -38,7 +39,25 @@ public class SpanScorer implements Scorer {
*/
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter) throws IOException {
init(query, field, cachingTokenFilter, null);
init(query, field, cachingTokenFilter, null, false);
}
/**
* Creates a scorer without an IndexReader (null is forwarded to init),
* with optional expansion of multi-term queries.
*
* @param query
* Query to use for highlighting
* @param field
* Field to highlight - pass null to ignore fields
* @param cachingTokenFilter
* of source text to be highlighted
* @param expandMultiTermQuery
* rewrite multi-term queries against a single doc memory index to
* create boolean queries
* @throws IOException
*/
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, boolean expandMultiTermQuery) throws IOException {
init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
}
/**
@ -54,7 +73,26 @@ public class SpanScorer implements Scorer {
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, IndexReader reader)
throws IOException {
init(query, field, cachingTokenFilter, reader);
init(query, field, cachingTokenFilter, reader, false);
}
/**
* Creates a scorer that forwards the supplied IndexReader to init, with
* optional expansion of multi-term queries.
*
* @param query
* Query to use for highlighting
* @param field
* Field to highlight - pass null to ignore fields
* @param cachingTokenFilter
* of source text to be highlighted
* @param reader
* IndexReader handed to init together with the query
* @param expandMultiTermQuery
* rewrite multi-term queries against a single doc memory index to
* create boolean queries
* @throws IOException
*/
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
throws IOException {
init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
}
/**
@ -64,7 +102,17 @@ public class SpanScorer implements Scorer {
CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField)
throws IOException {
this.defaultField = defaultField.intern();
init(query, field, cachingTokenFilter, reader);
init(query, field, cachingTokenFilter, reader, false);
}
/**
* Creates a scorer with an IndexReader, a default field and optional
* expansion of multi-term queries.
*
* @param query
* Query to use for highlighting
* @param field
* Field to highlight - pass null to ignore fields
* @param cachingTokenFilter
* of source text to be highlighted
* @param reader
* IndexReader handed to init together with the query
* @param defaultField
* The default field for queries with the field name unspecified;
* must not be null because intern() is invoked on it
* @param expandMultiTermQuery
* rewrite multi-term queries against a single doc memory index to
* create boolean queries
* @throws IOException
*/
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField, boolean expandMultiTermQuery)
throws IOException {
// The default field is stored interned before init builds the term extractor.
this.defaultField = defaultField.intern();
init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
}
/**
@ -73,7 +121,16 @@ public class SpanScorer implements Scorer {
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException {
this.defaultField = defaultField.intern();
init(query, field, cachingTokenFilter, null);
init(query, field, cachingTokenFilter, null, false);
}
/**
* Creates a scorer without an IndexReader (null is forwarded to init), with
* a default field and optional expansion of multi-term queries.
*
* @param query
* Query to use for highlighting
* @param field
* Field to highlight - pass null to ignore fields
* @param cachingTokenFilter
* of source text to be highlighted
* @param defaultField - The default field for queries with the field name unspecified;
* must not be null because intern() is invoked on it
* @param expandMultiTermQuery
* rewrite multi-term queries against a single doc memory index to
* create boolean queries
* @throws IOException
*/
public SpanScorer(Query query, String field,
CachingTokenFilter cachingTokenFilter, String defaultField, boolean expandMultiTermQuery) throws IOException {
// The default field is stored interned before init builds the term extractor.
this.defaultField = defaultField.intern();
init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
}
/**
@ -165,13 +222,13 @@ public class SpanScorer implements Scorer {
* @throws IOException
*/
private void init(Query query, String field,
CachingTokenFilter cachingTokenFilter, IndexReader reader)
CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
throws IOException {
WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
: new WeightedSpanTermExtractor(defaultField);
qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery);
qse.setExpandMultiTermQuery(expandMultiTermQuery);
if (reader == null) {
this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
cachingTokenFilter, field);
@ -183,6 +240,8 @@ public class SpanScorer implements Scorer {
/**
* @return whether ConstantScoreRangeQuerys are set to be highlighted
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
* constructor option to expand MultiTerm queries.
*/
public static boolean isHighlightCnstScrRngQuery() {
return highlightCnstScrRngQuery;
@ -197,10 +256,13 @@ public class SpanScorer implements Scorer {
}
/**
* Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be
* highlighted if you rewrite the query first. Must be called before SpanScorer construction.
* Turns highlighting of ConstantScoreRangeQuery on/off.
* ConstantScoreRangeQuerys cannot be highlighted if you rewrite the query
* first. Must be called before SpanScorer construction.
*
* @param highlightCnstScrRngQuery
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
* constructor option to expand MultiTerm queries.
*/
public static void setHighlightCnstScrRngQuery(boolean highlight) {
highlightCnstScrRngQuery = highlight;

View File

@ -38,11 +38,16 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
@ -59,6 +64,7 @@ public class WeightedSpanTermExtractor {
private Map readers = new HashMap(10); // Map<String, IndexReader>
private String defaultField;
private boolean highlightCnstScrRngQuery;
private boolean expandMultiTermQuery;
public WeightedSpanTermExtractor() {
}
@ -131,6 +137,14 @@ public class WeightedSpanTermExtractor {
extract((Query) iterator.next(), disjunctTerms);
}
terms.putAll(disjunctTerms);
} else if (query instanceof MultiTermQuery && (highlightCnstScrRngQuery || expandMultiTermQuery)) {
MultiTermQuery mtq = ((MultiTermQuery)query);
if(mtq.getConstantScoreRewrite()) {
query = copyMultiTermQuery(mtq);
mtq.setConstantScoreRewrite(false);
}
IndexReader ir = getReaderForField(fieldName);
extract(query.rewrite(ir), terms);
} else if (query instanceof MultiPhraseQuery) {
final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
final List termArrays = mpq.getTermArrays();
@ -179,26 +193,6 @@ public class WeightedSpanTermExtractor {
sp.setBoost(query.getBoost());
extractWeightedSpanTerms(terms, sp);
}
} else if (highlightCnstScrRngQuery && query instanceof ConstantScoreRangeQuery) {
ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
Term lower = new Term(fieldName, q.getLowerVal());
Term upper = new Term(fieldName, q.getUpperVal());
FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));
try {
TermEnum te = fir.terms(lower);
BooleanQuery bq = new BooleanQuery();
do {
Term term = te.term();
if (term != null && upper.compareTo(term) >= 0) {
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
} else {
break;
}
} while (te.next());
extract(bq, terms);
} finally {
fir.close();
}
}
}
@ -425,10 +419,19 @@ public class WeightedSpanTermExtractor {
return terms;
}
/**
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use
* getExpandMultiTermQuery instead.
*/
public boolean isHighlightCnstScrRngQuery() {
return highlightCnstScrRngQuery;
}
/**
* @param highlightCnstScrRngQuery
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
* setExpandMultiTermQuery option.
*/
public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
}
@ -460,4 +463,35 @@ public class WeightedSpanTermExtractor {
}
}
/**
 * Builds a fresh instance of the given multi-term query so that the caller
 * can rewrite it without going through the constant-score rewrite of the
 * original.
 *
 * The copy carries the boost of the original. Previously only the
 * WildcardQuery branch did so; the Range/Prefix/Fuzzy branches set the boost
 * on the source object (or self-assigned it) and returned a copy with the
 * default boost, which skewed the weights of the extracted terms.
 *
 * @param query
 *          multi-term query to duplicate
 * @return a new query of the same concrete type with the same boost, or
 *         <code>query</code> itself when its concrete type is not one of
 *         RangeQuery, WildcardQuery, PrefixQuery or FuzzyQuery
 */
private Query copyMultiTermQuery(MultiTermQuery query) {
  final Query copy;
  if (query instanceof RangeQuery) {
    RangeQuery range = (RangeQuery) query;
    copy = new RangeQuery(range.getField(), range.getLowerTermText(), range.getUpperTermText(),
        range.includesLower(), range.includesUpper());
  } else if (query instanceof WildcardQuery) {
    copy = new WildcardQuery(query.getTerm());
  } else if (query instanceof PrefixQuery) {
    copy = new PrefixQuery(query.getTerm());
  } else if (query instanceof FuzzyQuery) {
    FuzzyQuery fuzzy = (FuzzyQuery) query;
    copy = new FuzzyQuery(fuzzy.getTerm(), fuzzy.getMinSimilarity(), fuzzy.getPrefixLength());
  } else {
    // Unknown MultiTermQuery subtype: no way to clone it here, hand back the original.
    return query;
  }
  // Apply the boost to the copy, never to the source query.
  copy.setBoost(query.getBoost());
  return copy;
}
/**
* @return whether MultiTermQuery instances are rewritten and their terms
* extracted during term extraction
*/
public boolean getExpandMultiTermQuery() {
return expandMultiTermQuery;
}
/**
* Turns expansion of MultiTermQuery instances during term extraction on/off.
*
* @param expandMultiTermQuery
* true to rewrite multi-term queries and extract the resulting terms
*/
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
this.expandMultiTermQuery = expandMultiTermQuery;
}
}

View File

@ -16,12 +16,19 @@ be highlighted if the sub-query is position sensitive. The start and end positio
matching Spans are recorded with the respective WeightedSpanTerms and these positions are
then used to filter possible Token matches during scoring.
</p>
<p>
Unlike the QueryScorer, you do not want to rewrite the query first with the SpanScorer for
multi-term query handling, i.e. wildcard, fuzzy, and range queries.
The SpanScorer constructors provide an option to enable the highlighting of multi-term queries.
If this option is enabled, the SpanScorer will rewrite the query against a single doc index
containing the doc to be highlighted, rather than against the full index. If you do rewrite the
query first, certain multi-term queries may not highlight correctly.
</p>
<h2>Example Usage</h2>
<pre>
IndexSearcher searcher = new IndexSearcher(ramDir);
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
query = query.rewrite(reader); //required to expand search terms
Hits hits = searcher.search(query);
for (int i = 0; i &lt; hits.length(); i++)
@ -29,7 +36,7 @@ then used to filter possible Token matches during scoring.
String text = hits.doc(i).get(FIELD_NAME);
CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
FIELD_NAME, new StringReader(text)));
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream, true));
tokenStream.reset();
// Get 3 best fragments and separate with a "..."

View File

@ -63,6 +63,7 @@ import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
import org.apache.lucene.search.spans.SpanNearQuery;
@ -437,7 +438,7 @@ public class HighlighterTest extends TestCase implements Formatter {
public void run() throws Exception {
numHighlights = 0;
doSearching("Kinnedy~");
doStandardHighlights(analyzer, hits, query, HighlighterTest.this);
doStandardHighlights(analyzer, hits, query, HighlighterTest.this, true);
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 5);
}
@ -540,6 +541,45 @@ public class HighlighterTest extends TestCase implements Formatter {
numHighlights == 5);
}
/**
 * Highlights the hits of a wildcard query that has constant-score rewrite
 * switched on, using a SpanScorer built with multi-term expansion enabled,
 * and expects five highlights in total.
 */
public void testConstantScoreMultiTermQuery() throws Exception {
  final int maxNumFragmentsRequired = 2;
  final String fragmentSeparator = "...";

  numHighlights = 0;

  WildcardQuery wildcard = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
  wildcard.setConstantScoreRewrite(true);
  query = wildcard;

  searcher = new IndexSearcher(ramDir);
  // The query is deliberately NOT rewritten against the reader here: a
  // constant-score query rewrites to ConstantScoreQuery, which cannot be
  // highlighted.
  System.out.println("Searching for: " + query.toString(FIELD_NAME));
  hits = searcher.search(query);

  for (int hitIdx = 0; hitIdx < hits.length(); hitIdx++) {
    String text = hits.doc(hitIdx).get(HighlighterTest.FIELD_NAME);

    CachingTokenFilter cachedTokens = new CachingTokenFilter(
        analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)));
    SpanScorer spanScorer = new SpanScorer(query, HighlighterTest.FIELD_NAME, cachedTokens, true);
    Highlighter highlighter = new Highlighter(this, spanScorer);

    // Reset the cached stream after the scorer was built from it and before highlighting.
    cachedTokens.reset();
    highlighter.setTextFragmenter(new SimpleFragmenter(20));

    String result = highlighter.getBestFragments(cachedTokens, text, maxNumFragmentsRequired,
        fragmentSeparator);
    System.out.println("\t" + result);
  }

  assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
      numHighlights == 5);
}
public void testGetBestFragmentsPhrase() throws Exception {
TestHighlightRunner helper = new TestHighlightRunner() {
@ -1565,6 +1605,11 @@ class SynonymTokenizer extends TokenStream {
}
/**
* Convenience overload: delegates to the five-argument variant with
* expandMT set to false.
*/
void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter)
throws Exception {
doStandardHighlights(analyzer, hits, query, formatter, false);
}
void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter, boolean expandMT)
throws Exception {
for (int i = 0; i < hits.length(); i++) {
@ -1577,7 +1622,7 @@ class SynonymTokenizer extends TokenStream {
tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
new StringReader(text)));
scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME,
(CachingTokenFilter) tokenStream);
(CachingTokenFilter) tokenStream, expandMT);
} else if (mode == STANDARD) {
scorer = new QueryScorer(query);
tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));