From 570880d3acb45c925e8dc77172e0725ab8ba07b8 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Sat, 7 Jan 2017 23:32:37 -0500 Subject: [PATCH] SOLR-9935: Add hl.fragsize support when using the UnifiedHighlighter --- solr/CHANGES.txt | 5 +++- .../highlight/UnifiedSolrHighlighter.java | 12 +++++++++- .../highlight/TestUnifiedSolrHighlighter.java | 24 +++++++++++++++---- .../solr/common/params/HighlightParams.java | 2 +- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 899dcd38248..c18381ec2a2 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -109,7 +109,7 @@ Upgrade Notes * SOLR-9708: You are encouraged to try out the UnifiedHighlighter by setting hl.method=unified and report feedback. It might become the default in 7.0. It's more efficient/faster than the other highlighters, especially compared to the - original Highlighter. That said, some options aren't supported yet, notably hl.fragsize. + original Highlighter. That said, some options aren't supported yet. It will get more features in time, especially with your input. See HighlightParams.java for a listing of highlight parameters annotated with which highlighters use them. hl.useFastVectorHighlighter is now considered deprecated in lieu of hl.method=fastVector. @@ -225,6 +225,9 @@ New Features * SOLR-7466: Enable leading wildcard in complexphrase query parser, optimize it with ReversedWildcardFilterFactory when it's provided (Mikhail Khludnev) +* SOLR-9935: Add hl.fragsize support when using the UnifiedHighlighter to avoid snippets/Passages that are too small. + Defaults to 70. (David Smiley) + Optimizations ---------------------- * SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java index 910fa2b8d75..5b59b856167 100644 --- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Query; import org.apache.lucene.search.postingshighlight.WholeBreakIterator; import org.apache.lucene.search.uhighlight.DefaultPassageFormatter; +import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator; import org.apache.lucene.search.uhighlight.PassageFormatter; import org.apache.lucene.search.uhighlight.PassageScorer; import org.apache.lucene.search.uhighlight.UnifiedHighlighter; @@ -299,7 +300,16 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT); Locale locale = parseLocale(language, country, variant); String type = params.getFieldParam(field, HighlightParams.BS_TYPE); - return parseBreakIterator(type, locale); + BreakIterator baseBI = parseBreakIterator(type, locale); + + // Use a default fragsize the same as the regex Fragmenter (original Highlighter) since we're + // both likely shooting for sentence-like patterns. + int fragsize = params.getFieldInt(field, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE); + if (fragsize <= 1 || baseBI instanceof WholeBreakIterator) { // no real minimum size + return baseBI; + } + return LengthGoalBreakIterator.createMinLength(baseBI, fragsize); + // TODO option for using createClosestToLength() } /** diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java index e2511bef53d..2eb4ba3f8c5 100644 --- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java +++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java @@ -78,7 +78,8 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 { "text2", "document one", "text3", "crappy document", "id", "101")); assertU(commit()); assertQ("multiple snippets test", - req("q", "text:document", "sort", "id asc", "hl", "true", "hl.snippets", "2", "hl.bs.type", "SENTENCE"), + req("q", "text:document", "sort", "id asc", "hl", "true", "hl.snippets", "2", "hl.bs.type", "SENTENCE", + "hl.fragsize", "0"), "count(//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/*)=2", "//lst[@name='highlighting']/lst[@name='101']/arr/str[1]='Document snippet one. '", "//lst[@name='highlighting']/lst[@name='101']/arr/str[2]='Document snippet two.'"); @@ -202,21 +203,34 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 { "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'"); } - public void testBreakIterator() { + public void testBreakIteratorWord() { assertQ("different breakiterator", - req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD"), + req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD", "hl.fragsize", "-1"), "count(//lst[@name='highlighting']/*)=2", "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='document'", "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='document'"); } - public void testBreakIterator2() { + public void testBreakIteratorWhole() { assertU(adoc("text", "Document one has a first sentence. Document two has a second sentence.", "id", "103")); assertU(commit()); assertQ("different breakiterator", - req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"), + req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE", "hl.fragsize", "-1"), "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='Document one has a first sentence. Document two has a second sentence.'"); } + + public void testFragsize() { + // test default is 70... so make a sentence that is a little less (closer to 70 than end of text) + clearIndex(); + assertU(adoc("id", "10", "text", "This is a sentence just under seventy chars in length blah blah. Next sentence is here.")); + assertU(commit()); + assertQ("default fragsize", + req("q", "text:seventy", "hl", "true"), + "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. Next sentence is here.'"); + assertQ("smaller fragsize", + req("q", "text:seventy", "hl", "true", "hl.fragsize", "60"), // a bit smaller + "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. '"); + } public void testEncoder() { assertU(adoc("text", "Document one has a first sentence.", "id", "103")); diff --git a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java index 917e9f57926..997fc7ecf49 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java @@ -49,7 +49,7 @@ public interface HighlightParams { public static final String HIGHLIGHT_ALTERNATE = HIGHLIGHT+".highlightAlternate"; // OH, FVH // sizing - public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH + public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH, UH public static final String FRAGMENTER = HIGHLIGHT+".fragmenter"; // OH public static final String INCREMENT = HIGHLIGHT+".increment"; // OH public static final String REGEX = "regex"; // OH