SOLR-12754: New hl.weightMatches for UnifiedHighlighter WEIGHT_MATCHES

(defaults to true in master/8)
This commit is contained in:
David Smiley 2018-10-23 13:28:10 -04:00
parent 2e757f6c25
commit 3e89b7a771
5 changed files with 64 additions and 20 deletions

View File

@ -56,6 +56,9 @@ Upgrade Notes
"date.formats" configuration. To ensure date strings are properly parsed, use ParseDateFieldUpdateProcessorFactory
(an URP) commonly registered with the name "parse-date" in "schemaless mode". (David Smiley, Bar Rotstein)
* SOLR-12754: The UnifiedHighlighter hl.weightMatches now defaults to true. If there are unforeseen highlight problems,
this may be the culprit.
New Features
----------------------
@ -154,6 +157,8 @@ New Features
* SOLR-5004: Splitshard collections API now supports splitting into more than 2 sub-shards directly i.e. by providing a
numSubShards parameter (Christine Poerschke, Anshum Gupta)
* SOLR-12754: The UnifiedHighlighter has a new hl.weightMatches param defaulting to false (will be true in 8.0). It's
the highest query accuracy mode, and furthermore phrase queries are highlighted as one. (David Smiley)
Other Changes
----------------------

View File

@ -19,6 +19,7 @@ package org.apache.solr.highlight;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -43,7 +44,6 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
@ -80,6 +80,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <bool name="hl.usePhraseHighlighter">true</bool>
* <int name="hl.cacheFieldValCharsThreshold">524288</int>
* <str name="hl.offsetSource"></str>
* <bool name="hl.weightMatches">true</bool>
* </lst>
* </requestHandler>
* </pre>
@ -109,6 +110,7 @@ import org.apache.solr.util.plugin.PluginInfoInitialized;
* <li>hl.usePhraseHighlighter (bool) enables phrase highlighting. default is true
* <li>hl.cacheFieldValCharsThreshold (int) controls how many characters from a field are cached. default is 524288 (1MB in 2 byte chars)
* <li>hl.offsetSource (string) specifies which offset source to use, prefers postings, but will use what's available if not specified
* <li>hl.weightMatches (bool) enables Lucene Weight Matches mode. default is true</li>
* </ul>
*
* @lucene.experimental
@ -241,12 +243,9 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
this.setCacheFieldValCharsThreshold(
params.getInt(HighlightParams.CACHE_FIELD_VAL_CHARS_THRESHOLD, DEFAULT_CACHE_CHARS_THRESHOLD));
// SolrRequestInfo is a thread-local singleton providing access to the ResponseBuilder to code that
// otherwise can't get it in a nicer way.
SolrQueryRequest request = SolrRequestInfo.getRequestInfo().getReq();
final RTimerTree timerTree;
if (request.getRequestTimer() != null) { //It may be null if not used in a search context.
timerTree = request.getRequestTimer();
if (req.getRequestTimer() != null) { //It may be null if not used in a search context.
timerTree = req.getRequestTimer();
} else {
timerTree = new RTimerTree(); // since null checks are annoying
}
@ -394,20 +393,28 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
}
@Override
protected boolean shouldHandleMultiTermQuery(String field) {
return params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, true);
}
protected Set<HighlightFlag> getFlags(String field) {
Set<HighlightFlag> flags = EnumSet.noneOf(HighlightFlag.class);
if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, true)) {
flags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (params.getFieldBool(field, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
flags.add(HighlightFlag.PHRASES);
}
flags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
@Override
protected boolean shouldHighlightPhrasesStrictly(String field) {
return params.getFieldBool(field, HighlightParams.USE_PHRASE_HIGHLIGHTER, true);
if (params.getFieldBool(field, HighlightParams.WEIGHT_MATCHES, true)
&& flags.contains(HighlightFlag.PHRASES) && flags.contains(HighlightFlag.MULTI_TERM_QUERY)) {
flags.add(HighlightFlag.WEIGHT_MATCHES);
}
return flags;
}
@Override
protected Predicate<String> getFieldMatcher(String field) {
// TODO define hl.queryFieldPattern as a more advanced alternative to hl.requireFieldMatch.
// note that the UH & PH at Lucene level default to effectively "true"
// note that the UH at Lucene level defaults to effectively "true"
if (params.getFieldBool(field, HighlightParams.FIELD_MATCH, false)) {
return field::equals; // requireFieldMatch
} else {

View File

@ -103,7 +103,7 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertQ("strict phrase handling",
req("q", "text:\"strict phrases\"", "sort", "id asc", "hl", "true"),
"count(//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/*)=1",
"//lst[@name='highlighting']/lst[@name='101']/arr/str[1]='<em>Strict</em> <em>phrases</em> should be enabled for phrases'");
"//lst[@name='highlighting']/lst[@name='101']/arr/str[1]='<em>Strict phrases</em> should be enabled for phrases'");
}
public void testStrictPhrasesCanBeDisabled() {
@ -291,5 +291,15 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertQ(req("q", "id:101", "hl", "true", "hl.q", "text:document", "hl.fl", "text3", "hl.requireFieldMatch", "true"),
"count(//lst[@name='highlighting']/lst[@name='101']/arr[@name='text3']/*)=0");
}
/**
 * Verifies that a phrase query is highlighted term by term, rather than as a
 * single span, when the request sets hl.weightMatches=false.
 */
public void testWeightMatchesDisabled() {
  // Start from an empty index holding only the single document under test.
  clearIndex();
  assertU(adoc("text", "alpha bravo charlie", "id", "101"));
  assertU(commit());

  // Exactly one highlighted snippet is expected for the "text" field of doc 101 ...
  final String singleSnippet =
      "count(//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/*)=1";
  // ... and each phrase term must carry its own <em> wrapper.
  final String termsWrappedSeparately =
      "//lst[@name='highlighting']/lst[@name='101']/arr/str[1]='<em>alpha</em> <em>bravo</em> charlie'";

  assertQ("weight matches disabled, phrase highlights separately",
      req("q", "text:\"alpha bravo\"", "hl", "true", "hl.weightMatches", "false"),
      singleSnippet,
      termsWrappedSeparately);
}
}

View File

@ -147,19 +147,32 @@ There are many parameters supported by more than one highlighter, and sometimes
There are four highlighters available that can be chosen at runtime with the `hl.method` parameter, in order of general recommendation:
<<The Unified Highlighter,Unified Highlighter>>:: (`hl.method=unified`)
+
The Unified Highlighter is the newest highlighter (as of Solr 6.4), which stands out as the most flexible and performant of the options. We recommend that you try this highlighter even though it isn't the default (yet).
The Unified Highlighter is the newest highlighter (as of Solr 6.4), which stands out as the most performant and accurate of the options.
It can handle typical requirements and others possibly via plugins/extension.
We recommend that you try this highlighter even though it isn't the default (yet).
+
This highlighter supports the most common highlighting parameters and can handle just about any query accurately, even SpanQueries (e.g., as seen from the `surround` parser). A strong benefit to this highlighter is that you can opt to configure Solr to put more information in the underlying index to speed up highlighting of large documents; multiple configurations are supported, even on a per-field basis. There is little or no such flexibility for the other highlighters. More on this below.
The UH highlights a query very _accurately_ and thus is true to what the underlying Lucene query actually matches.
Other highlighters highlight terms more liberally (over-highlight).
A strong benefit to this highlighter is that you can opt to configure Solr to put more information in the underlying index to speed up highlighting of large documents; multiple configurations are supported, even on a per-field basis.
There is little or no such flexibility of offset sources for the other highlighters.
More on this below.
+
There are some reasons not to choose this highlighter: The `surround` query parser doesn't yet work here -- SOLR-12895.
Passage scoring does not consider boosts in the query.
Some people want more/better passage breaking flexibility.
<<The Original Highlighter,Original Highlighter>>:: (`hl.method=original`, the default)
+
The Original Highlighter, sometimes called the "Standard Highlighter" or "Default Highlighter", is Lucene's original highlighter a venerable option with a high degree of customization options. Its ability to highlight just about any query accurately is a strength shared with the Unified Highlighter (they share some code for this in fact).
The Original Highlighter, sometimes called the "Standard Highlighter" or "Default Highlighter", is Lucene's original highlighter -- a venerable option with a high degree of customization options.
Its query accuracy is good enough for most needs, although it's not quite as good/perfect as the Unified Highlighter.
+
The Original Highlighter will normally analyze stored text on the fly in order to highlight. It will use full term vectors if available, however in this mode it isn't as fast as the Unified Highlighter or FastVector Highlighter.
The Original Highlighter will normally analyze stored text on the fly in order to highlight. It will use full term vectors if available.
+
This highlighter is a good choice for a wide variety of search use-cases. Where it falls short is performance; it's often twice as slow as the Unified Highlighter. And despite being the most customizable, it doesn't have a BreakIterator based fragmenter (all the others do), which could pose a challenge for some languages.
Where this highlighter falls short is performance; it's often twice as slow as the Unified Highlighter. And despite being the most customizable, it doesn't have a BreakIterator based fragmenter (all the others do), which could pose a challenge for some languages.
<<The FastVector Highlighter,FastVector Highlighter>>:: (`hl.method=fastVector`)
+
@ -171,6 +184,7 @@ This highlighter's query-representation is less advanced than the Original or Un
+
Note that both the FastVector and Original Highlighters can be used in conjunction in a search request to highlight some fields with one and some the other. In contrast, the other highlighters can only be chosen exclusively.
The Unified Highlighter is exclusively configured via search parameters. In contrast, some settings for the Original and FastVector Highlighters are set in `solrconfig.xml`. There's a robust example of the latter in the "```techproducts```" configset.
In addition to further information below, more information can be found in the {solr-javadocs}/solr-core/org/apache/solr/highlight/package-summary.html[Solr javadocs].
@ -242,6 +256,13 @@ Indicates which character to break the text on. Use only if you have defined `hl
+
This is useful when the text has already been manipulated in advance to have a special delineation character at desired highlight passage boundaries. This character will still appear in the text as the last character of a passage.
`hl.weightMatches`::
Tells the UH to use Lucene's new "Weight Matches" API instead of doing SpanQuery conversion.
This is the most accurate highlighting mode reflecting the query.
Furthermore, phrases will be highlighted as a whole instead of word by word.
+
The default is `true`.
However, if either `hl.usePhraseHighlighter` or `hl.highlightMultiTerm` is set to false, then this setting is effectively false no matter what you set it to.
== The Original Highlighter

View File

@ -91,4 +91,5 @@ public interface HighlightParams {
public static final String PHRASE_LIMIT = HIGHLIGHT + ".phraseLimit"; // FVH
public static final String OFFSET_SOURCE = HIGHLIGHT + ".offsetSource"; // UH
public static final String CACHE_FIELD_VAL_CHARS_THRESHOLD = HIGHLIGHT + ".cacheFieldValCharsThreshold"; // UH
public static final String WEIGHT_MATCHES = HIGHLIGHT + ".weightMatches"; // UH
}