diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
index 1cd5fb9340d..cd40046f738 100644
--- a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -23,15 +23,23 @@ import java.text.CharacterIterator;
import java.util.Locale;
/**
- * A custom break iterator that scans text to find break-delimited passages bounded by
- * a provided maximum length. This class delegates the boundary search to a first level
- * break iterator. When this break iterator finds a passage greater than the maximum length
+ * A custom break iterator that is used to find break-delimited passages bounded by
+ * a provided maximum length in the {@link UnifiedHighlighter} context.
+ * This class uses a {@link BreakIterator} to find the last break after the provided offset
+ * that would create a passage smaller than {@code maxLen}.
+ * If the {@link BreakIterator} cannot find a passage smaller than the maximum length,
* a secondary break iterator is used to re-split the passage at the first boundary after
* maximum length.
+ *
* This is useful to split passages created by {@link BreakIterator}s like `sentence` that
* can create big outliers on semi-structured text.
*
+ *
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
+ *
+ * TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not
+ * depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses
+ * only the first matching offset to derive the start and end of each passage.
**/
public class BoundedBreakIteratorScanner extends BreakIterator {
private final BreakIterator mainBreak;
@@ -93,7 +101,15 @@ public class BoundedBreakIteratorScanner extends BreakIterator {
innerEnd = windowEnd;
} else {
windowStart = innerStart = mainBreak.preceding(offset);
- windowEnd = innerEnd = mainBreak.following(offset-1);
+ windowEnd = innerEnd = mainBreak.following(offset - 1);
+ // expand to next break until we reach maxLen
+ while (innerEnd - innerStart < maxLen) {
+ int newEnd = mainBreak.following(innerEnd);
+ if (newEnd == DONE || (newEnd - innerStart) > maxLen) {
+ break;
+ }
+ windowEnd = innerEnd = newEnd;
+ }
}
if (innerEnd - innerStart > maxLen) {
diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
index 6e5947d7beb..a577b5f7aff 100644
--- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
+++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -184,6 +184,20 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}
+ public void testSmallSentenceBoundedBreakIterator() throws Exception { // highlights one document made of a short, a long and a short sentence
+ final String[] inputs = {
+ "A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence."
+ };
+ final String[] outputs = { // expected passages: only the first sentence comes back whole; the second stops at "sentence", the third drops its final period
+ "A short sentence.",
+ "Followed by a bigger sentence",
+ "And a last short sentence"
+ };
+ TermQuery query = new TermQuery(new Term("text", "sentence")); // the term "sentence" occurs in each of the three input sentences
+ assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+ BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); // NOTE(review): 20 is presumably the maximum passage length -- confirm against getSentence
+ }
+
public void testRepeat() throws Exception {
final String[] inputs = {
"Fun fun fun fun fun fun fun fun fun fun"
@@ -205,4 +219,25 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}
+
+ public void testGroupSentences() throws Exception { // highlights one document that mixes very short sentences with a long one
+ final String[] inputs = {
+ "Two words. Followed by many words in a big sentence. One. Two. Three. And more words."
+ };
+ final String[] outputs = { // expected passages: "One. Two. Three." is returned as a single passage, the long sentence is cut short
+ "Two words.",
+ "Followed by many words",
+ "One. Two. Three.",
+ "And more words.",
+ };
+ BooleanQuery query = new BooleanQuery.Builder() // four SHOULD clauses on field "text": one, two, three, words
+ .add(new TermQuery(new Term("text", "one")), BooleanClause.Occur.SHOULD)
+ .add(new TermQuery(new Term("text", "two")), BooleanClause.Occur.SHOULD)
+ .add(new TermQuery(new Term("text", "three")), BooleanClause.Occur.SHOULD)
+ .add(new TermQuery(new Term("text", "words")), BooleanClause.Occur.SHOULD)
+ .build();
+ assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+ BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); // NOTE(review): 20 is presumably the maximum passage length -- confirm against getSentence
+ }
+
}
diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
index 5861e768436..cc23deda2d8 100644
--- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
+++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -397,7 +397,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
for (int i = 0; i < indexRequestBuilders.length; i++) {
assertHighlight(search, i, "title", 0,
- equalTo("This is a test on the highlighting bug present in elasticsearch."));
+ equalTo("This is a test on the highlighting bug present in elasticsearch. Hopefully it works."));
assertHighlight(search, i, "title", 1, 2,
equalTo("This is the second bug to perform highlighting on."));
}
@@ -491,7 +491,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
- assertHighlight(searchResponse, 0, "field-postings", 0, 1, equalTo("This is the first test sentence."));
+ assertHighlight(searchResponse, 0, "field-postings", 0, 1,
+ equalTo("This is the first test sentence. Here is the second one."));
assertHighlight(searchResponse, 0, "field-fvh", 0, 1, equalTo("This is the test with term_vectors"));
assertHighlight(searchResponse, 0, "field-plain", 0, 1, equalTo("This is the test for the plain highlighter"));
}
@@ -1386,7 +1387,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(highlight().field("field0").order("score").preTags("").postTags(""));
searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet();
- assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The quick brown fox jumps over the lazy dog"));
+ assertHighlight(searchResponse, 0, "field0", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog"));
logger.info("--> highlighting and searching on field1");
source = searchSource()
@@ -1438,7 +1440,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();
- assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The quick brown fox jumps over the lazy dog"));
+ assertHighlight(searchResponse, 0, "field3", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog"));
logger.info("--> highlighting and searching on field4");
source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "the fast bro"))
@@ -1453,7 +1456,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
equalTo("The quick brown fox jumps over the lazy dog")));
logger.info("--> highlighting and searching on field4");
- source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "a fast quick blue ca"))
+ source = searchSource().postFilter(termQuery("type", "type2"))
+ .query(matchPhrasePrefixQuery("field4", "a fast quick blue ca"))
.highlighter(highlight().field("field4").order("score").preTags("").postTags(""));
searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet();
@@ -1887,33 +1891,42 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighterType("plain")
.noMatchSize(20);
SearchResponse response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first"));
+ assertHighlight(response, 0, "text", 0, 1,
+ equalTo("This is the first"));
field.highlighterType("fvh");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
+ assertHighlight(response, 0, "text", 0, 1,
+ equalTo("This is the first sentence"));
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
+ assertHighlight(response, 0, "text", 0, 1,
+ equalTo("This is the first sentence"));
//if there's a match we only return the values with matches (whole value as number_of_fragments == 0)
MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth");
field.highlighterType("plain");
response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence."));
- assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence"));
+ assertHighlight(response, 0, "text", 0, 2,
+ equalTo("This is the third sentence. This is the fourth sentence."));
+ assertHighlight(response, 0, "text", 1, 2,
+ equalTo("This is the fifth sentence"));
field.highlighterType("fvh");
response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence."));
- assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence"));
+ assertHighlight(response, 0, "text", 0, 2,
+ equalTo("This is the third sentence. This is the fourth sentence."));
+ assertHighlight(response, 0, "text", 1, 2,
+ equalTo("This is the fifth sentence"));
field.highlighterType("unified");
response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get();
- assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence."));
- assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence"));
+ assertHighlight(response, 0, "text", 0, 2,
+ equalTo("This is the third sentence. This is the fourth sentence."));
+ assertHighlight(response, 0, "text", 1, 2,
+ equalTo("This is the fifth sentence"));
}
public void testPostingsHighlighter() throws Exception {
@@ -1989,7 +2002,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
new HighlightBuilder().field(new Field("field1").preTags("<1>").postTags("1>")
.requireFieldMatch(true)))
.get();
- assertHighlight(response, 0, "field1", 0, 1, equalTo("The quick brown <1>fox1>."));
+ assertHighlight(response, 0, "field1", 0, 1,
+ equalTo("The quick brown <1>fox1>. Second sentence."));
}
public void testPostingsHighlighterNumberOfFragments() throws Exception {
@@ -2012,9 +2026,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
- assertHighlight(searchResponse, 0, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog."));
- assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog."));
- assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox."));
+ assertThat(searchResponse.getHits().getHits().length, equalTo(1));
+ assertHighlight(searchResponse, 0, "field1", 0, 2,
+ equalTo("The quick brown fox jumps over the lazy dog." +
+ " The lazy red fox jumps over the quick dog."));
+ assertHighlight(searchResponse, 0, "field1", 1, 2,
+ equalTo("The quick brown dog jumps over the lazy fox."));
client().prepareIndex("test", "type1", "2")
.setSource("field1", new String[]{
@@ -2033,14 +2050,17 @@ public class HighlighterSearchIT extends ESIntegTestCase {
for (SearchHit searchHit : searchResponse.getHits()) {
if ("1".equals(searchHit.getId())) {
- assertHighlight(searchHit, "field1", 0, 1, equalTo("The quick brown fox jumps over the lazy dog. "
+ assertHighlight(searchHit, "field1", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog. "
+ "The lazy red fox jumps over the quick dog. "
+ "The quick brown dog jumps over the lazy fox."));
} else if ("2".equals(searchHit.getId())) {
assertHighlight(searchHit, "field1", 0, 3,
equalTo("The quick brown fox jumps over the lazy dog. Second sentence not finished"));
- assertHighlight(searchHit, "field1", 1, 3, equalTo("The lazy red fox jumps over the quick dog."));
- assertHighlight(searchHit, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox."));
+ assertHighlight(searchHit, "field1", 1, 3,
+ equalTo("The lazy red fox jumps over the quick dog."));
+ assertHighlight(searchHit, "field1", 2, 3,
+ equalTo("The quick brown dog jumps over the lazy fox."));
} else {
fail("Only hits with id 1 and 2 are returned");
}
@@ -2083,7 +2103,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
logger.info("Running multi-match type: [{}] highlight with type: [{}]", matchQueryType, highlighterType);
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
assertHitCount(searchResponse, 1L);
- assertHighlight(searchResponse, 0, "field1", 0, anyOf(equalTo("The quick brown fox jumps over"),
+ assertHighlight(searchResponse, 0, "field1", 0,
+ anyOf(equalTo("The quick brown fox jumps over"),
equalTo("The quick brown fox jumps over")));
}
}
@@ -2112,13 +2133,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
Map highlightFieldMap = searchResponse.getHits().getAt(0).getHighlightFields();
assertThat(highlightFieldMap.size(), equalTo(1));
HighlightField field1 = highlightFieldMap.get("field1");
- assertThat(field1.fragments().length, equalTo(5));
+ assertThat(field1.fragments().length, equalTo(4));
assertThat(field1.fragments()[0].string(),
equalTo("This sentence contains three sentence occurrences (sentence)."));
- assertThat(field1.fragments()[1].string(), equalTo("This sentence contains two sentence matches."));
- assertThat(field1.fragments()[2].string(), equalTo("This is the second value's first sentence."));
- assertThat(field1.fragments()[3].string(), equalTo("This sentence contains one match, not that short."));
- assertThat(field1.fragments()[4].string(),
+ assertThat(field1.fragments()[1].string(),
+ equalTo("This sentence contains one match, not that short. " +
+ "This sentence contains two sentence matches."));
+ assertThat(field1.fragments()[2].string(),
+ equalTo("This is the second value's first sentence. This one contains no matches."));
+ assertThat(field1.fragments()[3].string(),
equalTo("One sentence match here and scored lower since the text is quite long, not that appealing."));
}
@@ -2139,7 +2162,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
for (int i = 0; i < indexRequestBuilders.length; i++) {
assertHighlight(searchResponse, i, "title", 0, 1,
- equalTo("This is a html escaping highlighting test for *&?"));
+ equalTo("This is a html escaping highlighting test for *&? elasticsearch"));
}
}
@@ -2173,7 +2196,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertHitCount(searchResponse, 1L);
SearchHit hit = searchResponse.getHits().getAt(0);
//stopwords are not highlighted since not indexed
- assertHighlight(hit, "title", 0, 1, equalTo("this is a test ."));
+ assertHighlight(hit, "title", 0, 1, equalTo("this is a test . Second sentence."));
// search on title.key and highlight on title
searchResponse = client().prepareSearch()
@@ -2183,7 +2206,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
//stopwords are now highlighted since we used only whitespace analyzer here
assertHighlight(searchResponse, 0, "title.key", 0, 1,
- equalTo("this is a test ."));
+ equalTo("this is a test . Second sentence."));
}
public void testPostingsHighlighterMultiMapperFromSource() throws Exception {
@@ -2258,7 +2281,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(highlight().field("field2").preTags("").postTags(""));
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterCommonTermsQuery() throws IOException {
@@ -2275,7 +2299,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet();
assertHitCount(searchResponse, 1L);
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
private static XContentBuilder type1PostingsffsetsMapping() throws IOException {
@@ -2299,7 +2324,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
SearchSourceBuilder source = searchSource().query(prefixQuery("field2", "qui"))
.highlighter(highlight().field("field2"));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterFuzzyQuery() throws Exception {
@@ -2315,7 +2341,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(highlight().field("field2"));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterRegexpQuery() throws Exception {
@@ -2331,7 +2358,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(highlight().field("field2"));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterWildcardQuery() throws Exception {
@@ -2347,14 +2375,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(highlight().field("field2"));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
source = searchSource().query(wildcardQuery("field2", "qu*k"))
.highlighter(highlight().field("field2"));
searchResponse = client().prepareSearch("test").setSource(source).get();
assertHitCount(searchResponse, 1L);
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterTermRangeQuery() throws Exception {
@@ -2384,7 +2414,8 @@ public class HighlighterSearchIT extends ESIntegTestCase {
SearchSourceBuilder source = searchSource().query(queryStringQuery("qui*").defaultField("field2"))
.highlighter(highlight().field("field2"));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
- assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!"));
+ assertHighlight(searchResponse, 0, "field2", 0, 1,
+ equalTo("The quick brown fox jumps over the lazy dog! Second sentence."));
}
public void testPostingsHighlighterRegexpQueryWithinConstantScoreQuery() throws Exception {
@@ -2479,7 +2510,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertThat(searchResponse.getHits().getHits().length, equalTo(COUNT));
for (SearchHit hit : searchResponse.getHits()) {
String prefix = prefixes.get(hit.getId());
- assertHighlight(hit, "field1", 0, 1, equalTo("Sentence " + prefix + " test."));
+ assertHighlight(hit, "field1", 0, 1, equalTo("Sentence " + prefix + " test. Sentence two."));
}
}