mirror of https://github.com/apache/lucene.git
Add ConstantScore highlighting support to SpanScorer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@763856 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d74d56d9c9
commit
a1b3fd7240
|
@ -9,6 +9,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.ConstantScoreRangeQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
|
||||
|
@ -38,7 +39,25 @@ public class SpanScorer implements Scorer {
|
|||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter) throws IOException {
|
||||
init(query, field, cachingTokenFilter, null);
|
||||
init(query, field, cachingTokenFilter, null, false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param query
|
||||
* Query to use for highlighting
|
||||
* @param field
|
||||
* Field to highlight - pass null to ignore fields
|
||||
* @param tokenStream
|
||||
* of source text to be highlighted
|
||||
* @param expandMultiTermQuery
|
||||
* rewrite multi-term queries against a single doc memory index to
|
||||
* create boolean queries
|
||||
* @throws IOException
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, boolean expandMultiTermQuery) throws IOException {
|
||||
init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -54,7 +73,26 @@ public class SpanScorer implements Scorer {
|
|||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader)
|
||||
throws IOException {
|
||||
init(query, field, cachingTokenFilter, reader);
|
||||
init(query, field, cachingTokenFilter, reader, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param query
|
||||
* Query to use for highlighting
|
||||
* @param field
|
||||
* Field to highlight - pass null to ignore fields
|
||||
* @param tokenStream
|
||||
* of source text to be highlighted
|
||||
* @param reader
|
||||
* @param expandMultiTermQuery
|
||||
* rewrite multi-term queries against a single doc memory index to
|
||||
* create boolean queries
|
||||
* @throws IOException
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
|
||||
throws IOException {
|
||||
init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -64,7 +102,17 @@ public class SpanScorer implements Scorer {
|
|||
CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField)
|
||||
throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, reader);
|
||||
init(query, field, cachingTokenFilter, reader, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* As above, but with ability to pass in an <tt>IndexReader</tt>
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField, boolean expandMultiTermQuery)
|
||||
throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -73,7 +121,16 @@ public class SpanScorer implements Scorer {
|
|||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, null);
|
||||
init(query, field, cachingTokenFilter, null, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param defaultField - The default field for queries with the field name unspecified
|
||||
*/
|
||||
public SpanScorer(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, String defaultField, boolean expandMultiTermQuery) throws IOException {
|
||||
this.defaultField = defaultField.intern();
|
||||
init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -165,13 +222,13 @@ public class SpanScorer implements Scorer {
|
|||
* @throws IOException
|
||||
*/
|
||||
private void init(Query query, String field,
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader)
|
||||
CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
|
||||
throws IOException {
|
||||
WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
|
||||
: new WeightedSpanTermExtractor(defaultField);
|
||||
|
||||
qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery);
|
||||
|
||||
qse.setExpandMultiTermQuery(expandMultiTermQuery);
|
||||
if (reader == null) {
|
||||
this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
|
||||
cachingTokenFilter, field);
|
||||
|
@ -183,6 +240,8 @@ public class SpanScorer implements Scorer {
|
|||
|
||||
/**
|
||||
* @return whether ConstantScoreRangeQuerys are set to be highlighted
|
||||
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
|
||||
* constructor option to expand MultiTerm queries.
|
||||
*/
|
||||
public static boolean isHighlightCnstScrRngQuery() {
|
||||
return highlightCnstScrRngQuery;
|
||||
|
@ -197,10 +256,13 @@ public class SpanScorer implements Scorer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be
|
||||
* highlighted if you rewrite the query first. Must be called before SpanScorer construction.
|
||||
* Turns highlighting of ConstantScoreRangeQuery on/off.
|
||||
* ConstantScoreRangeQuerys cannot be highlighted if you rewrite the query
|
||||
* first. Must be called before SpanScorer construction.
|
||||
*
|
||||
* @param highlightCnstScrRngQuery
|
||||
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
|
||||
* constructor option to expand MultiTerm queries.
|
||||
*/
|
||||
public static void setHighlightCnstScrRngQuery(boolean highlight) {
|
||||
highlightCnstScrRngQuery = highlight;
|
||||
|
|
|
@ -38,11 +38,16 @@ import org.apache.lucene.search.BooleanQuery;
|
|||
import org.apache.lucene.search.ConstantScoreRangeQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.FilteredQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
|
@ -59,6 +64,7 @@ public class WeightedSpanTermExtractor {
|
|||
private Map readers = new HashMap(10); // Map<String, IndexReader>
|
||||
private String defaultField;
|
||||
private boolean highlightCnstScrRngQuery;
|
||||
private boolean expandMultiTermQuery;
|
||||
|
||||
public WeightedSpanTermExtractor() {
|
||||
}
|
||||
|
@ -131,6 +137,14 @@ public class WeightedSpanTermExtractor {
|
|||
extract((Query) iterator.next(), disjunctTerms);
|
||||
}
|
||||
terms.putAll(disjunctTerms);
|
||||
} else if (query instanceof MultiTermQuery && (highlightCnstScrRngQuery || expandMultiTermQuery)) {
|
||||
MultiTermQuery mtq = ((MultiTermQuery)query);
|
||||
if(mtq.getConstantScoreRewrite()) {
|
||||
query = copyMultiTermQuery(mtq);
|
||||
mtq.setConstantScoreRewrite(false);
|
||||
}
|
||||
IndexReader ir = getReaderForField(fieldName);
|
||||
extract(query.rewrite(ir), terms);
|
||||
} else if (query instanceof MultiPhraseQuery) {
|
||||
final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
|
||||
final List termArrays = mpq.getTermArrays();
|
||||
|
@ -179,26 +193,6 @@ public class WeightedSpanTermExtractor {
|
|||
sp.setBoost(query.getBoost());
|
||||
extractWeightedSpanTerms(terms, sp);
|
||||
}
|
||||
} else if (highlightCnstScrRngQuery && query instanceof ConstantScoreRangeQuery) {
|
||||
ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
|
||||
Term lower = new Term(fieldName, q.getLowerVal());
|
||||
Term upper = new Term(fieldName, q.getUpperVal());
|
||||
FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));
|
||||
try {
|
||||
TermEnum te = fir.terms(lower);
|
||||
BooleanQuery bq = new BooleanQuery();
|
||||
do {
|
||||
Term term = te.term();
|
||||
if (term != null && upper.compareTo(term) >= 0) {
|
||||
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (te.next());
|
||||
extract(bq, terms);
|
||||
} finally {
|
||||
fir.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -425,10 +419,19 @@ public class WeightedSpanTermExtractor {
|
|||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use
|
||||
* getExpandMultiTermQuery instead.
|
||||
*/
|
||||
public boolean isHighlightCnstScrRngQuery() {
|
||||
return highlightCnstScrRngQuery;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param highlightCnstScrRngQuery
|
||||
* @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
|
||||
* setExpandMultiTermQuery option.
|
||||
*/
|
||||
public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
|
||||
this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
|
||||
}
|
||||
|
@ -460,4 +463,35 @@ public class WeightedSpanTermExtractor {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
private Query copyMultiTermQuery(MultiTermQuery query) {
|
||||
if(query instanceof RangeQuery) {
|
||||
RangeQuery q = (RangeQuery)query;
|
||||
q.setBoost(query.getBoost());
|
||||
return new RangeQuery(q.getField(), q.getLowerTermText(), q.getUpperTermText(), q.includesLower(), q.includesUpper());
|
||||
} else if(query instanceof WildcardQuery) {
|
||||
Query q = new WildcardQuery(query.getTerm());
|
||||
q.setBoost(query.getBoost());
|
||||
return q;
|
||||
} else if(query instanceof PrefixQuery) {
|
||||
Query q = new PrefixQuery(query.getTerm());
|
||||
q.setBoost(q.getBoost());
|
||||
return q;
|
||||
} else if(query instanceof FuzzyQuery) {
|
||||
FuzzyQuery q = (FuzzyQuery)query;
|
||||
q.setBoost(q.getBoost());
|
||||
return new FuzzyQuery(q.getTerm(), q.getMinSimilarity(), q.getPrefixLength());
|
||||
}
|
||||
|
||||
return query;
|
||||
}
|
||||
|
||||
|
||||
public boolean getExpandMultiTermQuery() {
|
||||
return expandMultiTermQuery;
|
||||
}
|
||||
|
||||
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
|
||||
this.expandMultiTermQuery = expandMultiTermQuery;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,12 +16,19 @@ be highlighted if the sub-query is position sensitive. The start and end positio
|
|||
matching Spans are recorded with the respective WeightedSpanTerms and these positions are
|
||||
then used to filter possible Token matches during scoring.
|
||||
</p>
|
||||
<p>
|
||||
Unlike the QueryScorer, you do not want to rewrite the query first with the SpanScorer for
|
||||
multi term query handling ie wildcard, fuzzy, range.
|
||||
The SpanScorer constructors provide an option to enable the highlighting of multi-term queries.
|
||||
If this option is enabled, the SpanScorer will rewrite the query against a single doc index
|
||||
containing the doc to be highlighted, rather than against the full index. If you do rewrite the
|
||||
query first, certain multi-term queries may not highlight correctly.
|
||||
</p>
|
||||
<h2>Example Usage</h2>
|
||||
|
||||
<pre>
|
||||
IndexSearcher searcher = new IndexSearcher(ramDir);
|
||||
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
|
||||
query = query.rewrite(reader); //required to expand search terms
|
||||
Hits hits = searcher.search(query);
|
||||
|
||||
for (int i = 0; i < hits.length(); i++)
|
||||
|
@ -29,7 +36,7 @@ then used to filter possible Token matches during scoring.
|
|||
String text = hits.doc(i).get(FIELD_NAME);
|
||||
CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
|
||||
FIELD_NAME, new StringReader(text)));
|
||||
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
|
||||
Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream, true));
|
||||
tokenStream.reset();
|
||||
|
||||
// Get 3 best fragments and seperate with a "..."
|
||||
|
|
|
@ -63,6 +63,7 @@ import org.apache.lucene.search.RangeFilter;
|
|||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
|
@ -437,7 +438,7 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
public void run() throws Exception {
|
||||
numHighlights = 0;
|
||||
doSearching("Kinnedy~");
|
||||
doStandardHighlights(analyzer, hits, query, HighlighterTest.this);
|
||||
doStandardHighlights(analyzer, hits, query, HighlighterTest.this, true);
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
|
||||
numHighlights == 5);
|
||||
}
|
||||
|
@ -540,6 +541,45 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
numHighlights == 5);
|
||||
}
|
||||
|
||||
public void testConstantScoreMultiTermQuery() throws Exception {
|
||||
|
||||
numHighlights = 0;
|
||||
|
||||
query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
|
||||
((WildcardQuery)query).setConstantScoreRewrite(true);
|
||||
searcher = new IndexSearcher(ramDir);
|
||||
// can't rewrite ConstantScore if you want to highlight it -
|
||||
// it rewrites to ConstantScoreQuery which cannot be highlighted
|
||||
// query = unReWrittenQuery.rewrite(reader);
|
||||
System.out.println("Searching for: " + query.toString(FIELD_NAME));
|
||||
hits = searcher.search(query);
|
||||
|
||||
for (int i = 0; i < hits.length(); i++) {
|
||||
String text = hits.doc(i).get(HighlighterTest.FIELD_NAME);
|
||||
int maxNumFragmentsRequired = 2;
|
||||
String fragmentSeparator = "...";
|
||||
SpanScorer scorer = null;
|
||||
TokenStream tokenStream = null;
|
||||
|
||||
tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
|
||||
new StringReader(text)));
|
||||
|
||||
scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME, (CachingTokenFilter) tokenStream, true);
|
||||
|
||||
Highlighter highlighter = new Highlighter(this, scorer);
|
||||
|
||||
((CachingTokenFilter) tokenStream).reset();
|
||||
|
||||
highlighter.setTextFragmenter(new SimpleFragmenter(20));
|
||||
|
||||
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
|
||||
fragmentSeparator);
|
||||
System.out.println("\t" + result);
|
||||
}
|
||||
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
|
||||
numHighlights == 5);
|
||||
}
|
||||
|
||||
public void testGetBestFragmentsPhrase() throws Exception {
|
||||
TestHighlightRunner helper = new TestHighlightRunner() {
|
||||
|
||||
|
@ -1565,6 +1605,11 @@ class SynonymTokenizer extends TokenStream {
|
|||
}
|
||||
|
||||
void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter)
|
||||
throws Exception {
|
||||
doStandardHighlights(analyzer, hits, query, formatter, false);
|
||||
}
|
||||
|
||||
void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter, boolean expandMT)
|
||||
throws Exception {
|
||||
|
||||
for (int i = 0; i < hits.length(); i++) {
|
||||
|
@ -1577,7 +1622,7 @@ class SynonymTokenizer extends TokenStream {
|
|||
tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
|
||||
new StringReader(text)));
|
||||
scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME,
|
||||
(CachingTokenFilter) tokenStream);
|
||||
(CachingTokenFilter) tokenStream, expandMT);
|
||||
} else if (mode == STANDARD) {
|
||||
scorer = new QueryScorer(query);
|
||||
tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
|
||||
|
|
Loading…
Reference in New Issue