LUCENE-9093: UnifiedHighlighter LengthGoalBreakIterator frag align

Matches in passages should be centered better on average.
 Closes #1123

(cherry picked from commit 4c9cc2cefd)
This commit is contained in:
Nándor Mátravölgyi 2020-01-01 00:57:00 -05:00 committed by David Smiley
parent a97271fc52
commit 5874b9c793
9 changed files with 280 additions and 104 deletions

View File

@ -7,7 +7,10 @@ http://s.apache.org/luceneversions
API Changes
---------------------
(No changes)
* LUCENE-9093: Not an API change but a change in behavior of the UnifiedHighlighter's LengthGoalBreakIterator that will
yield Passages sized a little different due to the fact that the sizing pivot is now the center of the first match and
not its left edge.
New Features
---------------------
@ -30,6 +33,10 @@ Improvements
* LUCENE-9106: UniformSplit postings format allows extension of block/line serializers. (Bruno Roustant)
* LUCENE-9093: UnifiedHighlighter's LengthGoalBreakIterator has a new fragmentAlignment option to better center the
first match in the passage. Also the sizing point now pivots at the center of the first match term and not its left
edge. This yields Passages that won't be identical to the previous behavior. (Nándor Mátravölgyi, David Smiley)
Optimizations
---------------------
(No changes)

View File

@ -141,6 +141,7 @@ public class FieldHighlighter {
}
});
Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
int lastPassageEnd = 0;
do {
int start = off.startOffset();
@ -158,9 +159,12 @@ public class FieldHighlighter {
if (start >= contentLength) {
break;
}
// find fragment from the middle of the match, so the result's length may be closer to fragsize
final int center = start + (end - start) / 2;
// advance breakIterator
passage.setStartOffset(Math.max(this.breakIterator.preceding(start + 1), 0));
passage.setEndOffset(Math.min(this.breakIterator.following(start), contentLength));
passage.setStartOffset(Math.min(start, Math.max(this.breakIterator.preceding(Math.max(start + 1, center)), lastPassageEnd)));
lastPassageEnd = Math.max(end, Math.min(this.breakIterator.following(Math.min(end - 1, center)), contentLength));
passage.setEndOffset(lastPassageEnd);
}
// Add this term to the passage.
BytesRef term = off.getTerm();// a reference; safe to refer to

View File

@ -33,22 +33,49 @@ public class LengthGoalBreakIterator extends BreakIterator {
private final BreakIterator baseIter;
private final int lengthGoal;
private final float fragmentAlignment; // how much text to align before match-fragment, valid in range [0, 1]
private final boolean isMinimumLength; // if false then is "closest to" length
private int currentCache;
/** Breaks will be at least {@code minLength} apart (to the extent possible). */
/**
* Breaks will be at least {@code minLength} apart (to the extent possible),
* while trying to position the match inside the fragment according to {@code fragmentAlignment}.
*/
public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength,
float fragmentAlignment) {
return new LengthGoalBreakIterator(baseIter, minLength, fragmentAlignment, true, baseIter.current());
}
/** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
@Deprecated
public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
return new LengthGoalBreakIterator(baseIter, minLength, true);
return createMinLength(baseIter, minLength, 0.f);
}
/** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
* is chosen. */
/**
* Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
* is chosen. The match will be positioned according to {@code fragmentAlignment} as much as possible.
*/
public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength,
float fragmentAlignment) {
return new LengthGoalBreakIterator(baseIter, targetLength, fragmentAlignment, false, baseIter.current());
}
/** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
@Deprecated
public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
return new LengthGoalBreakIterator(baseIter, targetLength, false);
return createClosestToLength(baseIter, targetLength, 0.f);
}
private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, float fragmentAlignment,
boolean isMinimumLength, int currentCache) {
this.baseIter = baseIter;
this.currentCache = currentCache;
this.lengthGoal = lengthGoal;
if (fragmentAlignment < 0.f || fragmentAlignment > 1.f || !Float.isFinite(fragmentAlignment)) {
throw new IllegalArgumentException("fragmentAlignment must be >= zero and <= one");
}
this.fragmentAlignment = fragmentAlignment;
this.isMinimumLength = isMinimumLength;
}
@ -60,12 +87,15 @@ public class LengthGoalBreakIterator extends BreakIterator {
@Override
public String toString() {
String goalDesc = isMinimumLength ? "minLen" : "targetLen";
return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", fragAlign=" + fragmentAlignment +
", baseIter=" + baseIter + "}";
}
@Override
public Object clone() {
return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
return new LengthGoalBreakIterator(
(BreakIterator) baseIter.clone(), lengthGoal, fragmentAlignment, isMinimumLength, currentCache
);
}
@Override
@ -76,26 +106,28 @@ public class LengthGoalBreakIterator extends BreakIterator {
@Override
public void setText(String newText) {
baseIter.setText(newText);
currentCache = baseIter.current();
}
@Override
public void setText(CharacterIterator newText) {
baseIter.setText(newText);
currentCache = baseIter.current();
}
@Override
public int current() {
return baseIter.current();
return currentCache;
}
@Override
public int first() {
return baseIter.first();
return currentCache = baseIter.first();
}
@Override
public int last() {
return baseIter.last();
return currentCache = baseIter.last();
}
@Override
@ -104,10 +136,10 @@ public class LengthGoalBreakIterator extends BreakIterator {
return baseIter.next(n); // probably wrong
}
// called by getSummaryPassagesNoHighlight to generate default summary.
// Called by getSummaryPassagesNoHighlight to generate default summary.
@Override
public int next() {
return following(current());
return following(currentCache, currentCache + lengthGoal);
}
@Override
@ -116,65 +148,70 @@ public class LengthGoalBreakIterator extends BreakIterator {
return baseIter.previous();
}
// called while the current position is the start of a new passage; find end of passage
@Override
public int following(int followingIdx) {
final int startIdx = current();
if (followingIdx < startIdx) {
assert false : "Not supported";
return baseIter.following(followingIdx);
}
final int targetIdx = startIdx + lengthGoal;
// When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
if (followingIdx >= targetIdx - 1) {
return baseIter.following(followingIdx);
}
// If target exceeds the text length, return the last index.
if (targetIdx >= getText().getEndIndex()) {
return baseIter.last();
}
// Find closest break >= the target
final int afterIdx = baseIter.following(targetIdx - 1);
if (afterIdx == DONE) { // we're at the end; can this happen?
return current();
}
if (afterIdx == targetIdx) { // right on the money
return afterIdx;
}
if (isMinimumLength) { // thus never undershoot
return afterIdx;
}
// note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.
// Find closest break < target
final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
if (beforeIdx <= followingIdx) { // too far back
return moveToBreak(afterIdx);
}
if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
return beforeIdx;
}
return moveToBreak(afterIdx);
public int following(int matchEndIndex) {
return following(matchEndIndex, (matchEndIndex + 1) + (int)(lengthGoal * (1.f - fragmentAlignment)));
}
private int moveToBreak(int idx) { // precondition: idx is a known break
// bi.isBoundary(idx) has side-effect of moving the position. Not obvious!
//boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
//assert moved && current() == idx;
private int following(int matchEndIndex, int targetIdx) {
if (targetIdx >= getText().getEndIndex()) {
if (currentCache == baseIter.last()) {
return DONE;
}
return currentCache = baseIter.last();
}
final int afterIdx = baseIter.following(targetIdx - 1);
if (afterIdx == DONE) {
currentCache = baseIter.last();
return DONE;
}
if (afterIdx == targetIdx) { // right on the money
return currentCache = afterIdx;
}
if (isMinimumLength) { // thus never undershoot
return currentCache = afterIdx;
}
// TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
int current = baseIter.following(idx - 2);
assert current == idx : "following() didn't move us to the expected index.";
return idx;
// note: it is a shame that we invoke preceding() *one more time*; BI's are sometimes expensive.
// Find closest break to target
final int beforeIdx = baseIter.preceding(targetIdx);
if (targetIdx - beforeIdx < afterIdx - targetIdx && beforeIdx > matchEndIndex) {
return currentCache = beforeIdx;
}
return currentCache = afterIdx;
}
// called at start of new Passage given first word start offset
@Override
public int preceding(int offset) {
return baseIter.preceding(offset); // no change needed
public int preceding(int matchStartIndex) {
final int targetIdx = (matchStartIndex - 1) - (int)(lengthGoal * fragmentAlignment);
if (targetIdx <= 0) {
if (currentCache == baseIter.first()) {
return DONE;
}
return currentCache = baseIter.first();
}
final int beforeIdx = baseIter.preceding(targetIdx + 1);
if (beforeIdx == DONE) {
currentCache = baseIter.first();
return DONE;
}
if (beforeIdx == targetIdx) { // right on the money
return currentCache = beforeIdx;
}
if (isMinimumLength) { // thus never undershoot
return currentCache = beforeIdx;
}
// note: it is a shame that we invoke following() *one more time*; BI's are sometimes expensive.
// Find closest break to target
final int afterIdx = baseIter.following(targetIdx - 1);
if (afterIdx - targetIdx < targetIdx - beforeIdx && afterIdx < matchStartIndex) {
return currentCache = afterIdx;
}
return currentCache = beforeIdx;
}
@Override

View File

@ -18,6 +18,7 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.text.BreakIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
@ -28,6 +29,7 @@ import org.apache.lucene.util.QueryBuilder;
public class LengthGoalBreakIteratorTest extends LuceneTestCase {
private static final String FIELD = "body";
private static final float[] ALIGNS = {0.f, 0.5f, 1.f};
// We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is
// not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence.
@ -39,65 +41,159 @@ public class LengthGoalBreakIteratorTest extends LuceneTestCase {
// 0 1
// 01234567890123456789
static final String CONTENT = "Aa bb. Cc dd. Ee ff";
static final String CONTENT2 = "Aa bb Cc dd X Ee ff Gg hh.";
static final String CONTENT3 = "Aa bbcc ddxyzee ffgg hh.";
public void testFragmentAlignmentConstructor() throws IOException {
BreakIterator baseBI = new CustomSeparatorBreakIterator('.');
// test fragmentAlignment validation
float[] valid_aligns = {0.f, 0.3333f, 0.5f, 0.99f, 1.f};
for (float alignment : valid_aligns) {
LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
}
float[] invalid_aligns = {-0.01f, -1.f, 1.5f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY};
for (float alignment : invalid_aligns) {
expectThrows(IllegalArgumentException.class, () -> {
LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
});
}
// test backwards compatibility constructors
String backwardCompString = LengthGoalBreakIterator.createClosestToLength(baseBI, 50).toString();
assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
backwardCompString = LengthGoalBreakIterator.createMinLength(baseBI, 50).toString();
assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
}
public void testTargetLen() throws IOException {
// "goal" means target length goal to find closest break
// at first word:
Query query = query("aa");
assertEquals("almost two sent",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 9));
assertEquals( "barely two sent",
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 10));
assertEquals("long goal",
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20)));
assertEquals("almost two sent A",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 7, 0.f));
assertEquals("almost two sent B",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 15, 0.5f));
assertEquals("almost two sent C",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 64, 1.f));
assertEquals("barely two sent A",
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 8, 0.f));
assertEquals("barely two sent B",
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 16, 0.5f));
assertEquals("long goal A",
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 14 + random().nextInt(20), 0.f));
assertEquals("long goal B",
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 28 + random().nextInt(20), 0.5f));
// at some word not at start of passage
query = query("dd");
assertEquals("short goal",
" Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, random().nextInt(5)));
assertEquals("almost two sent",
" Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 10));
assertEquals("barely two sent",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11));
assertEquals("long goal",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 12 + random().nextInt(20)));
for (float align : ALIGNS) {
// alignment is not meaningful if fragsize is shorter than or closer to match-fragment boundaries
assertEquals("short goal " + align,
" Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, random().nextInt(4), align));
}
// preceding/following inclusion by alignment parameter
assertEquals("barely two sent A",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.f));
assertEquals("barely two sent B",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.5f));
assertEquals("barely two sent C",
"Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 11, 1.f));
assertEquals("long goal A",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.f));
assertEquals("long goal B",
"Aa bb. Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.5f));
assertEquals("long goal C",
"Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 1.f));
query = query("ddxyzee");
assertEquals("test fragment search from the middle of the match; almost including",
"<b>ddxyzee</b> ", highlightClosestToLen(CONTENT3, query, 7, 0.5f, 1, ' '));
assertEquals("test fragment search from the middle of the match; barely including",
"bbcc <b>ddxyzee</b> ffgg ", highlightClosestToLen(CONTENT3, query, 14, 0.5f, 1, ' '));
}
public void testMinLen() throws IOException {
// minLen mode is simpler than targetLen... just test a few cases
Query query = query("dd");
assertEquals("almost two sent",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 6));
assertEquals("barely two sent",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 7));
assertEquals("almost two sent A",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 0, 0.f));
assertEquals("almost two sent B",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 1, 0.5f));
assertEquals("almost two sent C",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 5, 1.f));
assertEquals("barely two sent A",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 1, 0.f));
assertEquals("barely two sent B",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 2, 0.5f));
assertEquals("barely two sent C",
"Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 7, 1.f));
assertEquals("barely two sent D/a",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 2, 0.55f));
assertEquals("barely two sent D/b",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 3, 0.55f));
assertEquals("barely two sent E/a",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.5f));
assertEquals("barely two sent E/b",
"Aa bb. Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.7f));
assertEquals("barely two sent E/c",
"Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 9, 0.9f));
query = query("ddxyzee");
assertEquals("test fragment search from the middle of the match; almost including",
"<b>ddxyzee</b> ", highlightMinLen(CONTENT3, query, 7, 0.5f, ' '));
assertEquals("test fragment search from the middle of the match; barely including",
"bbcc <b>ddxyzee</b> ffgg ", highlightMinLen(CONTENT3, query, 8, 0.5f, ' '));
}
public void testMinLenPrecision() throws IOException {
Query query = query("x");
assertEquals("test absolute minimal length",
"<b>X</b> ", highlightMinLen(CONTENT2, query, 1, 0.5f, ' '));
assertEquals("test slightly above minimal length",
"dd <b>X</b> Ee ", highlightMinLen(CONTENT2, query, 4, 0.5f, ' '));
}
public void testDefaultSummaryTargetLen() throws IOException {
Query query = query("zz");
assertEquals("Aa bb.",
highlightClosestToLen(CONTENT, query, random().nextInt(10))); // < 10
for (float align : ALIGNS) { // alignment is not used for creating default-summary
assertEquals("Aa bb.",
highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), align));
assertEquals("Aa bb. Cc dd.",
highlightClosestToLen(CONTENT, query, 12 + random().nextInt(4), align));
assertEquals("Aa bb. Cc dd. Ee ff",
highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), align));
}
assertEquals("Aa bb. Cc dd.",
highlightClosestToLen(CONTENT, query, 10 + 6)); // cusp of adding 3rd sentence
assertEquals("Aa bb. Cc dd. Ee ff",
highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20))); // >= 14
highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), 0.f, 2));
}
private Query query(String qStr) {
return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr);
}
private String highlightClosestToLen(String content, Query query, int lengthGoal) throws IOException {
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
return highlightClosestToLen(content, query, lengthGoal, fragAlign, 1);
}
private String highlightMinLen(String content, Query query, int lengthGoal) throws IOException {
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages) throws IOException {
return highlightClosestToLen(content, query, lengthGoal, fragAlign, maxPassages, '.');
}
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages, char separator) throws IOException {
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
return highlighter.highlightWithoutSearcher(FIELD, query, content, maxPassages).toString();
}
private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
return highlightMinLen(content, query, lengthGoal, fragAlign, '.');
}
private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign, char separator) throws IOException {
// differs from above only by "createMinLength"
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
}
}

View File

@ -105,6 +105,10 @@ Improvements
* SOLR-14129: Reuse Jackson ObjectMapper in AuditLoggerPlugin (janhoy)
* LUCENE-9093: The Unified highlighter has two new passage sizing parameters, hl.fragAlignRatio and
hl.fragsizeIsMinimum, with defaults that aim to better center matches in fragments than previously. See the ref guide.
Regardless of the settings, the passages may be sized differently than before. (Nándor Mátravölgyi, David Smiley)
Optimizations
---------------------
(No changes)

View File

@ -326,8 +326,11 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
if (fragsize <= 1) { // no real minimum size
return baseBI;
}
return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
// TODO option for using createClosestToLength()
float fragalign = params.getFieldFloat(field, HighlightParams.FRAGALIGNRATIO, 0.5f);
if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, false)) {
return LengthGoalBreakIterator.createMinLength(baseBI, fragsize, fragalign);
}
return LengthGoalBreakIterator.createClosestToLength(baseBI, fragsize, fragalign);
}
/**

View File

@ -244,15 +244,24 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
assertU(commit());
assertQ("CUSTOM breakiterator",
assertQ("CUSTOM breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
assertQ("different breakiterator",
assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
"//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other <em>document</em> contains the same #'");
assertQ("CUSTOM breakiterator with fragsize 70 minimum",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "true"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains # special characters, while the other <em>document</em> contains the same #'");
assertQ("CUSTOM breakiterator with fragsize 70",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
assertQ("CUSTOM breakiterator with fragsize 90",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "90"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
assertQ("CUSTOM breakiterator with fragsize 100",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "100"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains # special characters, while the other <em>document</em> contains the same #'");
}
@ -262,11 +271,17 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertU(adoc("id", "10", "text", "This is a sentence just under seventy chars in length blah blah. Next sentence is here."));
assertU(commit());
assertQ("default fragsize",
req("q", "text:seventy", "hl", "true"),
req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. Next sentence is here.'");
assertQ("smaller fragsize",
req("q", "text:seventy", "hl", "true", "hl.fragsize", "60"), // a bit smaller
assertQ("default fragsize",
req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true", "hl.fragsize", "60"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. '");
assertQ("smaller fragsize",
req("q", "text:seventy", "hl", "true"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. '");
assertQ("default fragsize",
req("q", "text:seventy", "hl", "true", "hl.fragsize", "90"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. Next sentence is here.'");
}
public void testEncoder() {

View File

@ -225,6 +225,14 @@ By default, the Unified Highlighter will usually pick the right offset source (s
+
The offset source can be explicitly configured to one of: `ANALYSIS`, `POSTINGS`, `POSTINGS_WITH_TERM_VECTORS`, or `TERM_VECTORS`.
`hl.fragAlignRatio`::
Fragment alignment can influence where the match in a passage is positioned. This floating point value is used to break the `hl.fragsize` around the match. The default value of `0.5` means to align the match to the middle. A value of `0.0` would mean to align the match to the left, while `1.0` to align it to the right. _Note: there are situations where the requested alignment is not plausible. This depends on the length of the match, the used breakiterator and the text content around the match._
`hl.fragsizeIsMinimum`::
By default this parameter is `false`. In this case the highlighter will accept a shorter fragment than `hl.fragsize`, if that is closer to the target than what the next longer choice would be.
+
When treating `hl.fragsize` as a strict minimum length is acceptable, turning this on has some performance benefits to consider.
`hl.tag.ellipsis`::
By default, each snippet is returned as a separate value (as is done with the other highlighters). Set this parameter to instead return one string with this text as the delimiter. _Note: this is likely to be removed in the future._

View File

@ -47,6 +47,8 @@ public interface HighlightParams {
// sizing
public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH, UH
public static final String FRAGSIZEISMINIMUM = HIGHLIGHT+".fragsizeIsMinimum"; // UH
public static final String FRAGALIGNRATIO = HIGHLIGHT+".fragAlignRatio"; // UH
public static final String FRAGMENTER = HIGHLIGHT+".fragmenter"; // OH
public static final String INCREMENT = HIGHLIGHT+".increment"; // OH
public static final String REGEX = "regex"; // OH