SOLR-9935: Add hl.fragsize support when using the UnifiedHighlighter

This commit is contained in:
David Smiley 2017-01-07 23:32:37 -05:00
parent ea49989524
commit 570880d3ac
4 changed files with 35 additions and 8 deletions

View File

@ -109,7 +109,7 @@ Upgrade Notes
* SOLR-9708: You are encouraged to try out the UnifiedHighlighter by setting hl.method=unified and report feedback. It
might become the default in 7.0. It's more efficient/faster than the other highlighters, especially compared to the
original Highlighter. That said, some options aren't supported yet, notably hl.fragsize.
original Highlighter. That said, some options aren't supported yet.
It will get more features in time, especially with your input. See HighlightParams.java
for a listing of highlight parameters annotated with which highlighters use them.
hl.useFastVectorHighlighter is now considered deprecated in lieu of hl.method=fastVector.
@ -225,6 +225,9 @@ New Features
* SOLR-7466: Enable leading wildcard in complexphrase query parser, optimize it with ReversedWildcardFilterFactory
when it's provided (Mikhail Khludnev)
* SOLR-9935: Add hl.fragsize support when using the UnifiedHighlighter to avoid snippets/Passages that are too small.
Defaults to 70. (David Smiley)
Optimizations
----------------------
* SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
@ -299,7 +300,16 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
Locale locale = parseLocale(language, country, variant);
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
return parseBreakIterator(type, locale);
BreakIterator baseBI = parseBreakIterator(type, locale);
// Use a default fragsize the same as the regex Fragmenter (original Highlighter) since we're
// both likely shooting for sentence-like patterns.
int fragsize = params.getFieldInt(field, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE);
if (fragsize <= 1 || baseBI instanceof WholeBreakIterator) { // no real minimum size
return baseBI;
}
return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
// TODO option for using createClosestToLength()
}
/**

View File

@ -78,7 +78,8 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
"text2", "document one", "text3", "crappy document", "id", "101"));
assertU(commit());
assertQ("multiple snippets test",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.snippets", "2", "hl.bs.type", "SENTENCE"),
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.snippets", "2", "hl.bs.type", "SENTENCE",
"hl.fragsize", "0"),
"count(//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr/str[1]='<em>Document</em> snippet one. '",
"//lst[@name='highlighting']/lst[@name='101']/arr/str[2]='<em>Document</em> snippet two.'");
@ -202,22 +203,35 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'");
}
public void testBreakIterator() {
public void testBreakIteratorWord() {
assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD"),
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD", "hl.fragsize", "-1"),
"count(//lst[@name='highlighting']/*)=2",
"//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em>'",
"//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='<em>document</em>'");
}
public void testBreakIterator2() {
public void testBreakIteratorWhole() {
assertU(adoc("text", "Document one has a first sentence. Document two has a second sentence.", "id", "103"));
assertU(commit());
assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE", "hl.fragsize", "-1"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
public void testFragsize() {
// test default is 70... so make a sentence that is a little less (closer to 70 than end of text)
clearIndex();
assertU(adoc("id", "10", "text", "This is a sentence just under seventy chars in length blah blah. Next sentence is here."));
assertU(commit());
assertQ("default fragsize",
req("q", "text:seventy", "hl", "true"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. Next sentence is here.'");
assertQ("smaller fragsize",
req("q", "text:seventy", "hl", "true", "hl.fragsize", "60"), // a bit smaller
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. '");
}
public void testEncoder() {
assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
assertU(commit());

View File

@ -49,7 +49,7 @@ public interface HighlightParams {
public static final String HIGHLIGHT_ALTERNATE = HIGHLIGHT+".highlightAlternate"; // OH, FVH
// sizing
public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH
public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH, UH
public static final String FRAGMENTER = HIGHLIGHT+".fragmenter"; // OH
public static final String INCREMENT = HIGHLIGHT+".increment"; // OH
public static final String REGEX = "regex"; // OH