mirror of https://github.com/apache/lucene.git
SOLR-553 Use SpanScorer when highlighting phrase terms and hl.usePhraseHighlighter=true
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@659664 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5ba95af1bb
commit
cacf6ae295
|
@ -409,7 +409,14 @@ Bug Fixes
|
|||
|
||||
31. SOLR-514: Added explicit media-type with UTF* charset to *.xsl files that
|
||||
don't already have one. (hossman)
|
||||
|
||||
|
||||
32. SOLR-505: Give RequestHandlers the possiblity to suppress the generation
|
||||
of HTTP caching headers. (Thomas Peuss via Otis Gospodnetic)
|
||||
|
||||
33. SOLR-553: Handle highlighting of phrase terms better when
|
||||
hl.usePhraseHighligher=true URL param is used.
|
||||
(Bojan Smid via Otis Gospodnetic)
|
||||
|
||||
Other Changes
|
||||
1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
|
||||
build scripts to make two jars: apache-solr-1.3.jar and
|
||||
|
|
|
@ -33,6 +33,8 @@ public interface HighlightParams {
|
|||
public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
|
||||
public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";
|
||||
public static final String ALTERNATE_FIELD_LENGTH = HIGHLIGHT+".maxAlternateFieldLength";
|
||||
|
||||
public static final String USE_PHRASE_HIGHLIGHTER = HIGHLIGHT+".usePhraseHighlighter";
|
||||
|
||||
public static final String MERGE_CONTIGUOUS_FRAGMENTS = HIGHLIGHT + ".mergeContiguous";
|
||||
// Formatter
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.util.logging.Logger;
|
|||
import javax.xml.xpath.XPathConstants;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -41,6 +42,7 @@ import org.apache.lucene.search.highlight.Formatter;
|
|||
import org.apache.lucene.search.highlight.Fragmenter;
|
||||
import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.SpanScorer;
|
||||
import org.apache.lucene.search.highlight.TextFragment;
|
||||
import org.apache.lucene.search.highlight.TokenSources;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
@ -55,7 +57,6 @@ import org.apache.solr.schema.SchemaField;
|
|||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.SolrPluginUtils;
|
||||
import org.apache.solr.util.plugin.NamedListPluginLoader;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
|
@ -92,6 +93,27 @@ public class DefaultSolrHighlighter extends SolrHighlighter
|
|||
formatters.put( null, fmt );
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a phrase Highlighter appropriate for this field.
|
||||
* @param query The current Query
|
||||
* @param fieldName The name of the field
|
||||
* @param request The current SolrQueryRequest
|
||||
* @param tokenStream document text CachingTokenStream
|
||||
* @throws IOException
|
||||
*/
|
||||
protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, CachingTokenFilter tokenStream) throws IOException {
|
||||
SolrParams params = request.getParams();
|
||||
Highlighter highlighter = null;
|
||||
|
||||
highlighter = new Highlighter(getFormatter(fieldName, params), getSpanQueryScorer(query, fieldName, tokenStream, request));
|
||||
|
||||
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
|
||||
highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
|
||||
fieldName, HighlightParams.MAX_CHARS,
|
||||
Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
|
||||
|
||||
return highlighter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a Highlighter appropriate for this field.
|
||||
|
@ -111,6 +133,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter
|
|||
return highlighter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a SpanScorer suitable for this Query and field.
|
||||
* @param query The current query
|
||||
* @param tokenStream document text CachingTokenStream
|
||||
* @param fieldName The name of the field
|
||||
* @param request The SolrQueryRequest
|
||||
* @throws IOException
|
||||
*/
|
||||
private SpanScorer getSpanQueryScorer(Query query, String fieldName, CachingTokenFilter tokenStream, SolrQueryRequest request) throws IOException {
|
||||
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
|
||||
if (reqFieldMatch) {
|
||||
return new SpanScorer(query, fieldName, tokenStream);
|
||||
}
|
||||
else {
|
||||
return new SpanScorer(query, null, tokenStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a QueryScorer suitable for this Query and field.
|
||||
* @param query The current query
|
||||
|
@ -230,32 +270,59 @@ public class DefaultSolrHighlighter extends SolrHighlighter
|
|||
fieldName = fieldName.trim();
|
||||
String[] docTexts = doc.getValues(fieldName);
|
||||
if (docTexts == null) continue;
|
||||
|
||||
TokenStream tstream = null;
|
||||
|
||||
// create TokenStream
|
||||
if (docTexts.length == 1) {
|
||||
// single-valued field
|
||||
try {
|
||||
// attempt term vectors
|
||||
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
// fall back to anaylzer
|
||||
tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// multi-valued field
|
||||
tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
|
||||
}
|
||||
|
||||
Highlighter highlighter;
|
||||
|
||||
if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
|
||||
// wrap CachingTokenFilter around TokenStream for reuse
|
||||
tstream = new CachingTokenFilter(tstream);
|
||||
|
||||
// get highlighter
|
||||
highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
|
||||
|
||||
// after highlighter initialization, reset tstream since construction of highlighter already used it
|
||||
tstream.reset();
|
||||
}
|
||||
else {
|
||||
// use "the old way"
|
||||
highlighter = getHighlighter(query, fieldName, req);
|
||||
}
|
||||
|
||||
// get highlighter, and number of fragments for this field
|
||||
Highlighter highlighter = getHighlighter(query, fieldName, req);
|
||||
int numFragments = getMaxSnippets(fieldName, params);
|
||||
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
|
||||
|
||||
String[] summaries = null;
|
||||
TextFragment[] frag;
|
||||
if (docTexts.length == 1) {
|
||||
// single-valued field
|
||||
TokenStream tstream;
|
||||
try {
|
||||
// attempt term vectors
|
||||
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
// fall back to analyzer
|
||||
tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
|
||||
}
|
||||
frag = highlighter.getBestTextFragments(tstream, docTexts[0], mergeContiguousFragments, numFragments);
|
||||
}
|
||||
else {
|
||||
// multi-valued field
|
||||
MultiValueTokenStream tstream;
|
||||
tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
|
||||
frag = highlighter.getBestTextFragments(tstream, tstream.asSingleValue(), false, numFragments);
|
||||
StringBuilder singleValue = new StringBuilder();
|
||||
|
||||
for (String txt:docTexts) {
|
||||
singleValue.append(txt);
|
||||
}
|
||||
|
||||
frag = highlighter.getBestTextFragments(tstream, singleValue.toString(), false, numFragments);
|
||||
}
|
||||
// convert fragments back into text
|
||||
// TODO: we can include score and position information in output as snippet attributes
|
||||
|
@ -303,12 +370,8 @@ public class DefaultSolrHighlighter extends SolrHighlighter
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Helper class which creates a single TokenStream out of values from a
|
||||
* multi-valued field.
|
||||
* Creates a single TokenStream out multi-value field values.
|
||||
*/
|
||||
class MultiValueTokenStream extends TokenStream {
|
||||
private String fieldName;
|
||||
|
@ -378,7 +441,6 @@ class MultiValueTokenStream extends TokenStream {
|
|||
sb.append(str);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -424,5 +486,3 @@ class TokenOrderingFilter extends TokenFilter {
|
|||
return queue.isEmpty() ? null : queue.removeFirst();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -481,4 +481,59 @@ public class HighlighterTest extends AbstractSolrTestCase {
|
|||
"//lst[@name='highlighting']/lst[@name='1']/arr[@name='t_text']/str[.='a piece of text']"
|
||||
);
|
||||
}
|
||||
|
||||
public void testPhraseHighlighter() {
|
||||
HashMap<String,String> args = new HashMap<String,String>();
|
||||
args.put("hl", "true");
|
||||
args.put("hl.fl", "t_text");
|
||||
args.put("hl.fragsize", "40");
|
||||
args.put("hl.snippets", "10");
|
||||
|
||||
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
|
||||
"standard", 0, 200, args);
|
||||
|
||||
// String borrowed from Lucene's HighlighterTest
|
||||
String t = "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy";
|
||||
|
||||
assertU(adoc("t_text", t, "id", "1"));
|
||||
assertU(commit());
|
||||
assertU(optimize());
|
||||
|
||||
String oldHighlight1 = "//lst[@name='1']/arr[@name='t_text']/str[.='This piece of <em>text</em> <em>refers</em> to Kennedy']";
|
||||
String oldHighlight2 = "//lst[@name='1']/arr[@name='t_text']/str[.=' at the beginning then has a longer piece of <em>text</em>']";
|
||||
String oldHighlight3 = "//lst[@name='1']/arr[@name='t_text']/str[.=' with another <em>reference</em> to Kennedy']";
|
||||
String newHighlight1 = "//lst[@name='1']/arr[@name='t_text']/str[.='This piece of <em>text</em> <em>refers</em> to Kennedy']";
|
||||
|
||||
// check if old functionality is still the same
|
||||
assertQ("Phrase highlighting - old",
|
||||
sumLRF.makeRequest("t_text:\"text refers\""),
|
||||
"//lst[@name='highlighting']/lst[@name='1']",
|
||||
oldHighlight1, oldHighlight2, oldHighlight3
|
||||
);
|
||||
|
||||
assertQ("Phrase highlighting - old",
|
||||
sumLRF.makeRequest("t_text:text refers"),
|
||||
"//lst[@name='highlighting']/lst[@name='1']",
|
||||
oldHighlight1, oldHighlight2, oldHighlight3
|
||||
);
|
||||
|
||||
// now check if Lucene-794 highlighting works as expected
|
||||
args.put("hl.usePhraseHighlighter", "true");
|
||||
|
||||
sumLRF = h.getRequestFactory("standard", 0, 200, args);
|
||||
|
||||
// check phrase highlighting
|
||||
assertQ("Phrase highlighting - Lucene-794",
|
||||
sumLRF.makeRequest("t_text:\"text refers\""),
|
||||
"//lst[@name='highlighting']/lst[@name='1']",
|
||||
newHighlight1
|
||||
);
|
||||
|
||||
// non phrase queries should be highlighted as they were before this fix
|
||||
assertQ("Phrase highlighting - Lucene-794",
|
||||
sumLRF.makeRequest("t_text:text refers"),
|
||||
"//lst[@name='highlighting']/lst[@name='1']",
|
||||
oldHighlight1, oldHighlight2, oldHighlight3
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue