diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3fa7dc49ff9..54f01c5c278 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -83,7 +83,10 @@ Other
API Changes
---------------------
-(No changes)
+
+* LUCENE-9093: Not an API change but a change in behavior of the UnifiedHighlighter's LengthGoalBreakIterator that will
+ yield Passages sized a little different due to the fact that the sizing pivot is now the center of the first match and
+ not its left edge.
New Features
---------------------
@@ -106,6 +109,10 @@ Improvements
* LUCENE-9106: UniformSplit postings format allows extension of block/line serializers. (Bruno Roustant)
+* LUCENE-9093: UnifiedHighlighter's LengthGoalBreakIterator has a new fragmentAlignment option to better center the
+ first match in the passage. Also the sizing point now pivots at the center of the first match term and not its left
+ edge. This yields Passages that won't be identical to the previous behavior. (Nándor Mátravölgyi, David Smiley)
+
Optimizations
---------------------
(No changes)
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
index f2a34dc9ea5..5e45e3f40a9 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
@@ -141,6 +141,7 @@ public class FieldHighlighter {
}
});
Passage passage = new Passage(); // the current passage in-progress. Will either get reset or added to queue.
+ int lastPassageEnd = 0;
do {
int start = off.startOffset();
@@ -158,9 +159,12 @@ public class FieldHighlighter {
if (start >= contentLength) {
break;
}
+ // find fragment from the middle of the match, so the result's length may be closer to fragsize
+ final int center = start + (end - start) / 2;
// advance breakIterator
- passage.setStartOffset(Math.max(this.breakIterator.preceding(start + 1), 0));
- passage.setEndOffset(Math.min(this.breakIterator.following(start), contentLength));
+ passage.setStartOffset(Math.min(start, Math.max(this.breakIterator.preceding(Math.max(start + 1, center)), lastPassageEnd)));
+ lastPassageEnd = Math.max(end, Math.min(this.breakIterator.following(Math.min(end - 1, center)), contentLength));
+ passage.setEndOffset(lastPassageEnd);
}
// Add this term to the passage.
BytesRef term = off.getTerm();// a reference; safe to refer to
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
index 3134013ab35..a4b5a07f63b 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
@@ -33,22 +33,49 @@ public class LengthGoalBreakIterator extends BreakIterator {
private final BreakIterator baseIter;
private final int lengthGoal;
+ private final float fragmentAlignment; // how much text to align before match-fragment, valid in range [0, 1]
private final boolean isMinimumLength; // if false then is "closest to" length
+ private int currentCache;
- /** Breaks will be at least {@code minLength} apart (to the extent possible). */
+ /**
+ * Breaks will be at least {@code minLength} apart (to the extent possible),
+ * while trying to position the match inside the fragment according to {@code fragmentAlignment}.
+ */
+ public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength,
+ float fragmentAlignment) {
+ return new LengthGoalBreakIterator(baseIter, minLength, fragmentAlignment, true, baseIter.current());
+ }
+
+ /** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
+ @Deprecated
public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
- return new LengthGoalBreakIterator(baseIter, minLength, true);
+ return createMinLength(baseIter, minLength, 0.f);
}
- /** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
- * is chosen. */
+ /**
+ * Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
+ * is chosen. The match will be positioned according to {@code fragmentAlignment} as much as possible.
+ */
+ public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength,
+ float fragmentAlignment) {
+ return new LengthGoalBreakIterator(baseIter, targetLength, fragmentAlignment, false, baseIter.current());
+ }
+
+ /** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
+ @Deprecated
public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
- return new LengthGoalBreakIterator(baseIter, targetLength, false);
+ return createClosestToLength(baseIter, targetLength, 0.f);
}
- private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
+ private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, float fragmentAlignment,
+ boolean isMinimumLength, int currentCache) {
this.baseIter = baseIter;
+ this.currentCache = currentCache;
this.lengthGoal = lengthGoal;
+ if (fragmentAlignment < 0.f || fragmentAlignment > 1.f || !Float.isFinite(fragmentAlignment)) {
+ throw new IllegalArgumentException("fragmentAlignment must be >= zero and <= one");
+ }
+ this.fragmentAlignment = fragmentAlignment;
this.isMinimumLength = isMinimumLength;
}
@@ -60,12 +87,15 @@ public class LengthGoalBreakIterator extends BreakIterator {
@Override
public String toString() {
String goalDesc = isMinimumLength ? "minLen" : "targetLen";
- return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
+ return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", fragAlign=" + fragmentAlignment +
+ ", baseIter=" + baseIter + "}";
}
@Override
public Object clone() {
- return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
+ return new LengthGoalBreakIterator(
+ (BreakIterator) baseIter.clone(), lengthGoal, fragmentAlignment, isMinimumLength, currentCache
+ );
}
@Override
@@ -76,26 +106,28 @@ public class LengthGoalBreakIterator extends BreakIterator {
@Override
public void setText(String newText) {
baseIter.setText(newText);
+ currentCache = baseIter.current();
}
@Override
public void setText(CharacterIterator newText) {
baseIter.setText(newText);
+ currentCache = baseIter.current();
}
@Override
public int current() {
- return baseIter.current();
+ return currentCache;
}
@Override
public int first() {
- return baseIter.first();
+ return currentCache = baseIter.first();
}
@Override
public int last() {
- return baseIter.last();
+ return currentCache = baseIter.last();
}
@Override
@@ -104,10 +136,10 @@ public class LengthGoalBreakIterator extends BreakIterator {
return baseIter.next(n); // probably wrong
}
- // called by getSummaryPassagesNoHighlight to generate default summary.
+ // Called by getSummaryPassagesNoHighlight to generate default summary.
@Override
public int next() {
- return following(current());
+ return following(currentCache, currentCache + lengthGoal);
}
@Override
@@ -116,65 +148,70 @@ public class LengthGoalBreakIterator extends BreakIterator {
return baseIter.previous();
}
- // called while the current position is the start of a new passage; find end of passage
@Override
- public int following(int followingIdx) {
- final int startIdx = current();
- if (followingIdx < startIdx) {
- assert false : "Not supported";
- return baseIter.following(followingIdx);
- }
- final int targetIdx = startIdx + lengthGoal;
- // When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
- if (followingIdx >= targetIdx - 1) {
- return baseIter.following(followingIdx);
- }
- // If target exceeds the text length, return the last index.
- if (targetIdx >= getText().getEndIndex()) {
- return baseIter.last();
- }
-
- // Find closest break >= the target
- final int afterIdx = baseIter.following(targetIdx - 1);
- if (afterIdx == DONE) { // we're at the end; can this happen?
- return current();
- }
- if (afterIdx == targetIdx) { // right on the money
- return afterIdx;
- }
- if (isMinimumLength) { // thus never undershoot
- return afterIdx;
- }
-
- // note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.
-
- // Find closest break < target
- final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
- if (beforeIdx <= followingIdx) { // too far back
- return moveToBreak(afterIdx);
- }
-
- if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
- return beforeIdx;
- }
- return moveToBreak(afterIdx);
+ public int following(int matchEndIndex) {
+ return following(matchEndIndex, (matchEndIndex + 1) + (int)(lengthGoal * (1.f - fragmentAlignment)));
}
- private int moveToBreak(int idx) { // precondition: idx is a known break
- // bi.isBoundary(idx) has side-effect of moving the position. Not obvious!
- //boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
- //assert moved && current() == idx;
+ private int following(int matchEndIndex, int targetIdx) {
+ if (targetIdx >= getText().getEndIndex()) {
+ if (currentCache == baseIter.last()) {
+ return DONE;
+ }
+ return currentCache = baseIter.last();
+ }
+ final int afterIdx = baseIter.following(targetIdx - 1);
+ if (afterIdx == DONE) {
+ currentCache = baseIter.last();
+ return DONE;
+ }
+ if (afterIdx == targetIdx) { // right on the money
+ return currentCache = afterIdx;
+ }
+ if (isMinimumLength) { // thus never undershoot
+ return currentCache = afterIdx;
+ }
- // TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
- int current = baseIter.following(idx - 2);
- assert current == idx : "following() didn't move us to the expected index.";
- return idx;
+ // note: it is a shame that we invoke preceding() *one more time*; BI's are sometimes expensive.
+
+ // Find closest break to target
+ final int beforeIdx = baseIter.preceding(targetIdx);
+ if (targetIdx - beforeIdx < afterIdx - targetIdx && beforeIdx > matchEndIndex) {
+ return currentCache = beforeIdx;
+ }
+ return currentCache = afterIdx;
}
// called at start of new Passage given first word start offset
@Override
- public int preceding(int offset) {
- return baseIter.preceding(offset); // no change needed
+ public int preceding(int matchStartIndex) {
+ final int targetIdx = (matchStartIndex - 1) - (int)(lengthGoal * fragmentAlignment);
+ if (targetIdx <= 0) {
+ if (currentCache == baseIter.first()) {
+ return DONE;
+ }
+ return currentCache = baseIter.first();
+ }
+ final int beforeIdx = baseIter.preceding(targetIdx + 1);
+ if (beforeIdx == DONE) {
+ currentCache = baseIter.first();
+ return DONE;
+ }
+ if (beforeIdx == targetIdx) { // right on the money
+ return currentCache = beforeIdx;
+ }
+ if (isMinimumLength) { // thus never undershoot
+ return currentCache = beforeIdx;
+ }
+
+ // note: it is a shame that we invoke following() *one more time*; BI's are sometimes expensive.
+
+ // Find closest break to target
+ final int afterIdx = baseIter.following(targetIdx - 1);
+ if (afterIdx - targetIdx < targetIdx - beforeIdx && afterIdx < matchStartIndex) {
+ return currentCache = afterIdx;
+ }
+ return currentCache = beforeIdx;
}
@Override
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
index 2434f5259e5..c2406637bd3 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
@@ -18,6 +18,7 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
+import java.text.BreakIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
@@ -28,6 +29,7 @@ import org.apache.lucene.util.QueryBuilder;
public class LengthGoalBreakIteratorTest extends LuceneTestCase {
private static final String FIELD = "body";
+ private static final float[] ALIGNS = {0.f, 0.5f, 1.f};
// We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is
// not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence.
@@ -39,65 +41,159 @@ public class LengthGoalBreakIteratorTest extends LuceneTestCase {
// 0 1
// 01234567890123456789
static final String CONTENT = "Aa bb. Cc dd. Ee ff";
+ static final String CONTENT2 = "Aa bb Cc dd X Ee ff Gg hh.";
+ static final String CONTENT3 = "Aa bbcc ddxyzee ffgg hh.";
+
+ public void testFragmentAlignmentConstructor() throws IOException {
+ BreakIterator baseBI = new CustomSeparatorBreakIterator('.');
+ // test fragmentAlignment validation
+ float[] valid_aligns = {0.f, 0.3333f, 0.5f, 0.99f, 1.f};
+ for (float alignment : valid_aligns) {
+ LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
+ }
+ float[] invalid_aligns = {-0.01f, -1.f, 1.5f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY};
+ for (float alignment : invalid_aligns) {
+ expectThrows(IllegalArgumentException.class, () -> {
+ LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
+ });
+ }
+ // test backwards compatibility constructors
+ String backwardCompString = LengthGoalBreakIterator.createClosestToLength(baseBI, 50).toString();
+ assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
+ backwardCompString = LengthGoalBreakIterator.createMinLength(baseBI, 50).toString();
+ assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
+ }
public void testTargetLen() throws IOException {
// "goal" means target length goal to find closest break
// at first word:
Query query = query("aa");
- assertEquals("almost two sent",
- "Aa bb.", highlightClosestToLen(CONTENT, query, 9));
- assertEquals( "barely two sent",
- "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 10));
- assertEquals("long goal",
- "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20)));
-
+ assertEquals("almost two sent A",
+ "Aa bb.", highlightClosestToLen(CONTENT, query, 7, 0.f));
+ assertEquals("almost two sent B",
+ "Aa bb.", highlightClosestToLen(CONTENT, query, 15, 0.5f));
+ assertEquals("almost two sent C",
+ "Aa bb.", highlightClosestToLen(CONTENT, query, 64, 1.f));
+ assertEquals("barely two sent A",
+ "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 8, 0.f));
+ assertEquals("barely two sent B",
+ "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 16, 0.5f));
+ assertEquals("long goal A",
+ "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 14 + random().nextInt(20), 0.f));
+ assertEquals("long goal B",
+ "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 28 + random().nextInt(20), 0.5f));
// at some word not at start of passage
query = query("dd");
- assertEquals("short goal",
- " Cc dd.", highlightClosestToLen(CONTENT, query, random().nextInt(5)));
- assertEquals("almost two sent",
- " Cc dd.", highlightClosestToLen(CONTENT, query, 10));
- assertEquals("barely two sent",
- " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11));
- assertEquals("long goal",
- " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 12 + random().nextInt(20)));
+ for (float align : ALIGNS) {
+ // alignment is not meaningful if fragsize is shorter than or closer to match-fragment boundaries
+ assertEquals("short goal " + align,
+ " Cc dd.", highlightClosestToLen(CONTENT, query, random().nextInt(4), align));
+ }
+ // preceding/following inclusion by alignment parameter
+ assertEquals("barely two sent A",
+ " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.f));
+ assertEquals("barely two sent B",
+ " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.5f));
+ assertEquals("barely two sent C",
+ "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 11, 1.f));
+ assertEquals("long goal A",
+ " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.f));
+ assertEquals("long goal B",
+ "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.5f));
+ assertEquals("long goal C",
+ "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 1.f));
+
+ query = query("ddxyzee");
+ assertEquals("test fragment search from the middle of the match; almost including",
+ "ddxyzee ", highlightClosestToLen(CONTENT3, query, 7, 0.5f, 1, ' '));
+ assertEquals("test fragment search from the middle of the match; barely including",
+ "bbcc ddxyzee ffgg ", highlightClosestToLen(CONTENT3, query, 14, 0.5f, 1, ' '));
}
public void testMinLen() throws IOException {
// minLen mode is simpler than targetLen... just test a few cases
Query query = query("dd");
- assertEquals("almost two sent",
- " Cc dd.", highlightMinLen(CONTENT, query, 6));
- assertEquals("barely two sent",
- " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 7));
+ assertEquals("almost two sent A",
+ " Cc dd.", highlightMinLen(CONTENT, query, 0, 0.f));
+ assertEquals("almost two sent B",
+ " Cc dd.", highlightMinLen(CONTENT, query, 1, 0.5f));
+ assertEquals("almost two sent C",
+ " Cc dd.", highlightMinLen(CONTENT, query, 5, 1.f));
+
+ assertEquals("barely two sent A",
+ " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 1, 0.f));
+ assertEquals("barely two sent B",
+ " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 2, 0.5f));
+ assertEquals("barely two sent C",
+ "Aa bb. Cc dd.", highlightMinLen(CONTENT, query, 7, 1.f));
+ assertEquals("barely two sent D/a",
+ " Cc dd.", highlightMinLen(CONTENT, query, 2, 0.55f));
+ assertEquals("barely two sent D/b",
+ " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 3, 0.55f));
+ assertEquals("barely two sent E/a",
+ " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 10, 0.5f));
+ assertEquals("barely two sent E/b",
+ "Aa bb. Cc dd. Ee ff", highlightMinLen(CONTENT, query, 10, 0.7f));
+ assertEquals("barely two sent E/c",
+ "Aa bb. Cc dd.", highlightMinLen(CONTENT, query, 9, 0.9f));
+
+ query = query("ddxyzee");
+ assertEquals("test fragment search from the middle of the match; almost including",
+ "ddxyzee ", highlightMinLen(CONTENT3, query, 7, 0.5f, ' '));
+ assertEquals("test fragment search from the middle of the match; barely including",
+ "bbcc ddxyzee ffgg ", highlightMinLen(CONTENT3, query, 8, 0.5f, ' '));
+ }
+
+ public void testMinLenPrecision() throws IOException {
+ Query query = query("x");
+ assertEquals("test absolute minimal length",
+ "X ", highlightMinLen(CONTENT2, query, 1, 0.5f, ' '));
+ assertEquals("test slightly above minimal length",
+ "dd X Ee ", highlightMinLen(CONTENT2, query, 4, 0.5f, ' '));
}
public void testDefaultSummaryTargetLen() throws IOException {
Query query = query("zz");
- assertEquals("Aa bb.",
- highlightClosestToLen(CONTENT, query, random().nextInt(10))); // < 10
+ for (float align : ALIGNS) { // alignment is not used for creating default-summary
+ assertEquals("Aa bb.",
+ highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), align));
+ assertEquals("Aa bb. Cc dd.",
+ highlightClosestToLen(CONTENT, query, 12 + random().nextInt(4), align));
+ assertEquals("Aa bb. Cc dd. Ee ff",
+ highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), align));
+ }
assertEquals("Aa bb. Cc dd.",
- highlightClosestToLen(CONTENT, query, 10 + 6)); // cusp of adding 3rd sentence
- assertEquals("Aa bb. Cc dd. Ee ff",
- highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20))); // >= 14
+ highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), 0.f, 2));
}
private Query query(String qStr) {
return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr);
}
- private String highlightClosestToLen(String content, Query query, int lengthGoal) throws IOException {
- UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
- highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
- return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
+ private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
+ return highlightClosestToLen(content, query, lengthGoal, fragAlign, 1);
}
- private String highlightMinLen(String content, Query query, int lengthGoal) throws IOException {
+ private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages) throws IOException {
+ return highlightClosestToLen(content, query, lengthGoal, fragAlign, maxPassages, '.');
+ }
+
+ private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages, char separator) throws IOException {
+ UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
+ highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
+ return highlighter.highlightWithoutSearcher(FIELD, query, content, maxPassages).toString();
+ }
+
+ private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
+ return highlightMinLen(content, query, lengthGoal, fragAlign, '.');
+ }
+
+ private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign, char separator) throws IOException {
// differs from above only by "createMinLength"
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
- highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
+ highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
}
}
\ No newline at end of file
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index f124612c148..a21cebadb73 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -184,6 +184,10 @@ Improvements
* SOLR-14129: Reuse Jackson ObjectMapper in AuditLoggerPlugin (janhoy)
+* LUCENE-9093: The Unified highlighter has two new passage sizing parameters, hl.fragAlignRatio and
+ hl.fragsizeIsMinimum, with defaults that aim to better center matches in fragments than previously. See the ref guide.
+ Regardless of the settings, the passages may be sized differently than before. (Nándor Mátravölgyi, David Smiley)
+
Optimizations
---------------------
(No changes)
diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
index fc43687eaf6..90632146ecf 100644
--- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
@@ -326,8 +326,11 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
if (fragsize <= 1) { // no real minimum size
return baseBI;
}
- return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
- // TODO option for using createClosestToLength()
+ float fragalign = params.getFieldFloat(field, HighlightParams.FRAGALIGNRATIO, 0.5f);
+ if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, false)) {
+ return LengthGoalBreakIterator.createMinLength(baseBI, fragsize, fragalign);
+ }
+ return LengthGoalBreakIterator.createClosestToLength(baseBI, fragsize, fragalign);
}
/**
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
index ac3b7eaddcd..f6418cb8595 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
@@ -244,15 +244,24 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
assertU(commit());
- assertQ("CUSTOM breakiterator",
+ assertQ("CUSTOM breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'");
- assertQ("different breakiterator",
+ assertQ("different breakiterator",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
"//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other document contains the same #'");
+ assertQ("CUSTOM breakiterator with fragsize 70 minimum",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "true"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains # special characters, while the other document contains the same #'");
assertQ("CUSTOM breakiterator with fragsize 70",
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'");
+ assertQ("CUSTOM breakiterator with fragsize 90",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "90"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'");
+ assertQ("CUSTOM breakiterator with fragsize 100",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "100"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains # special characters, while the other document contains the same #'");
}
@@ -262,11 +271,17 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
assertU(adoc("id", "10", "text", "This is a sentence just under seventy chars in length blah blah. Next sentence is here."));
assertU(commit());
assertQ("default fragsize",
- req("q", "text:seventy", "hl", "true"),
+ req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. Next sentence is here.'");
- assertQ("smaller fragsize",
- req("q", "text:seventy", "hl", "true", "hl.fragsize", "60"), // a bit smaller
+ assertQ("default fragsize",
+ req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true", "hl.fragsize", "60"),
"//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. '");
+ assertQ("smaller fragsize",
+ req("q", "text:seventy", "hl", "true"),
+ "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. '");
+ assertQ("default fragsize",
+ req("q", "text:seventy", "hl", "true", "hl.fragsize", "90"),
+ "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. Next sentence is here.'");
}
public void testEncoder() {
diff --git a/solr/solr-ref-guide/src/highlighting.adoc b/solr/solr-ref-guide/src/highlighting.adoc
index 8055b79d7de..ab6b5a6da0c 100644
--- a/solr/solr-ref-guide/src/highlighting.adoc
+++ b/solr/solr-ref-guide/src/highlighting.adoc
@@ -225,6 +225,14 @@ By default, the Unified Highlighter will usually pick the right offset source (s
+
The offset source can be explicitly configured to one of: `ANALYSIS`, `POSTINGS`, `POSTINGS_WITH_TERM_VECTORS`, or `TERM_VECTORS`.
+`hl.fragAlignRatio`::
+Fragment alignment can influence where the match in a passage is positioned. This floating point value is used to break the `hl.fragsize` around the match. The default value of `0.5` means to align the match to the middle. A value of `0.0` would mean to align the match to the left, while `1.0` to align it to the right. _Note: there are situations where the requested alignment is not plausible. This depends on the length of the match, the used breakiterator and the text content around the match._
+
+`hl.fragsizeIsMinimum`::
+By default this parameter is `false`. In this case the highlighter will accept a shorter fragment than `hl.fragsize`, if that is closer to the target than what the next longer choice would be.
++
+When treating `hl.fragsize` as a strict minimum length is acceptable, turning this on has some performance benefits to consider.
+
`hl.tag.ellipsis`::
By default, each snippet is returned as a separate value (as is done with the other highlighters). Set this parameter to instead return one string with this text as the delimiter. _Note: this is likely to be removed in the future._
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
index c4825b70627..c5d59800fb4 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
@@ -47,6 +47,8 @@ public interface HighlightParams {
// sizing
public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH, UH
+ public static final String FRAGSIZEISMINIMUM = HIGHLIGHT+".fragsizeIsMinimum"; // UH
+ public static final String FRAGALIGNRATIO = HIGHLIGHT+".fragAlignRatio"; // UH
public static final String FRAGMENTER = HIGHLIGHT+".fragmenter"; // OH
public static final String INCREMENT = HIGHLIGHT+".increment"; // OH
public static final String REGEX = "regex"; // OH