diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java index 1cd5fb9340d..cd40046f738 100644 --- a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java @@ -23,15 +23,23 @@ import java.text.CharacterIterator; import java.util.Locale; /** - * A custom break iterator that scans text to find break-delimited passages bounded by - * a provided maximum length. This class delegates the boundary search to a first level - * break iterator. When this break iterator finds a passage greater than the maximum length + * A custom break iterator that is used to find break-delimited passages bounded by + * a provided maximum length in the {@link UnifiedHighlighter} context. + * This class uses a {@link BreakIterator} to find the last break after the provided offset + * that would create a passage smaller than maxLen. + * If the {@link BreakIterator} cannot find a passage smaller than the maximum length, * a secondary break iterator is used to re-split the passage at the first boundary after * maximum length. + * * This is useful to split passages created by {@link BreakIterator}s like `sentence` that * can create big outliers on semi-structured text. * + * * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}. + * + * TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not + * depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses + * only the first matching offset to derive the start and end of each passage. **/ public class BoundedBreakIteratorScanner extends BreakIterator { private final BreakIterator mainBreak; @@ -93,7 +101,15 @@ public class BoundedBreakIteratorScanner extends BreakIterator { innerEnd = windowEnd; } else { windowStart = innerStart = mainBreak.preceding(offset); - windowEnd = innerEnd = mainBreak.following(offset-1); + windowEnd = innerEnd = mainBreak.following(offset - 1); + // expand to next break until we reach maxLen + while (innerEnd - innerStart < maxLen) { + int newEnd = mainBreak.following(innerEnd); + if (newEnd == DONE || (newEnd - innerStart) > maxLen) { + break; + } + windowEnd = innerEnd = newEnd; + } } if (innerEnd - innerStart > maxLen) { diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 6e5947d7beb..a577b5f7aff 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -184,6 +184,20 @@ public class CustomUnifiedHighlighterTests extends ESTestCase { BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); } + public void testSmallSentenceBoundedBreakIterator() throws Exception { + final String[] inputs = { + "A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence." + }; + final String[] outputs = { + "A short sentence.", + "Followed by a bigger sentence", + "And a last short sentence" + }; + TermQuery query = new TermQuery(new Term("text", "sentence")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); + } + public void testRepeat() throws Exception { final String[] inputs = { "Fun fun fun fun fun fun fun fun fun fun" @@ -205,4 +219,25 @@ public class CustomUnifiedHighlighterTests extends ESTestCase { assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); } + + public void testGroupSentences() throws Exception { + final String[] inputs = { + "Two words. Followed by many words in a big sentence. One. Two. Three. And more words." + }; + final String[] outputs = { + "Two words.", + "Followed by many words", + "One. Two. Three.", + "And more words.", + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("text", "one")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "two")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "three")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "words")), BooleanClause.Occur.SHOULD) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); + } + } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 5861e768436..cc23deda2d8 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -397,7 +397,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { for (int i = 0; i < indexRequestBuilders.length; i++) { assertHighlight(search, i, "title", 0, - equalTo("This is a test on the highlighting bug present in elasticsearch.")); + equalTo("This is a test on the highlighting bug present in elasticsearch. Hopefully it works.")); assertHighlight(search, i, "title", 1, 2, equalTo("This is the second bug to perform highlighting on.")); } @@ -491,7 +491,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field-postings", 0, 1, equalTo("This is the first test sentence.")); + assertHighlight(searchResponse, 0, "field-postings", 0, 1, + equalTo("This is the first test sentence. Here is the second one.")); assertHighlight(searchResponse, 0, "field-fvh", 0, 1, equalTo("This is the test with term_vectors")); assertHighlight(searchResponse, 0, "field-plain", 0, 1, equalTo("This is the test for the plain highlighter")); } @@ -1386,7 +1387,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(highlight().field("field0").order("score").preTags("").postTags("")); searchResponse = client().search(searchRequest("first_test_index").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); + assertHighlight(searchResponse, 0, "field0", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog")); logger.info("--> highlighting and searching on field1"); source = searchSource() @@ -1438,7 +1440,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); + assertHighlight(searchResponse, 0, "field3", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog")); logger.info("--> highlighting and searching on field4"); source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "the fast bro")) @@ -1453,7 +1456,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { equalTo("The quick brown fox jumps over the lazy dog"))); logger.info("--> highlighting and searching on field4"); - source = searchSource().postFilter(termQuery("type", "type2")).query(matchPhrasePrefixQuery("field4", "a fast quick blue ca")) + source = searchSource().postFilter(termQuery("type", "type2")) + .query(matchPhrasePrefixQuery("field4", "a fast quick blue ca")) .highlighter(highlight().field("field4").order("score").preTags("").postTags("")); searchResponse = client().search(searchRequest("second_test_index").source(source)).actionGet(); @@ -1887,33 +1891,42 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighterType("plain") .noMatchSize(20); SearchResponse response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first")); + assertHighlight(response, 0, "text", 0, 1, + equalTo("This is the first")); field.highlighterType("fvh"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence")); + assertHighlight(response, 0, "text", 0, 1, + equalTo("This is the first sentence")); field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence")); + assertHighlight(response, 0, "text", 0, 1, + equalTo("This is the first sentence")); //if there's a match we only return the values with matches (whole value as number_of_fragments == 0) MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth"); field.highlighterType("plain"); response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence.")); - assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence")); + assertHighlight(response, 0, "text", 0, 2, + equalTo("This is the third sentence. This is the fourth sentence.")); + assertHighlight(response, 0, "text", 1, 2, + equalTo("This is the fifth sentence")); field.highlighterType("fvh"); response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence.")); - assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence")); + assertHighlight(response, 0, "text", 0, 2, + equalTo("This is the third sentence. This is the fourth sentence.")); + assertHighlight(response, 0, "text", 1, 2, + equalTo("This is the fifth sentence")); field.highlighterType("unified"); response = client().prepareSearch("test").setQuery(queryBuilder).highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence.")); - assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence")); + assertHighlight(response, 0, "text", 0, 2, + equalTo("This is the third sentence. This is the fourth sentence.")); + assertHighlight(response, 0, "text", 1, 2, + equalTo("This is the fifth sentence")); } public void testPostingsHighlighter() throws Exception { @@ -1989,7 +2002,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { new HighlightBuilder().field(new Field("field1").preTags("<1>").postTags("") .requireFieldMatch(true))) .get(); - assertHighlight(response, 0, "field1", 0, 1, equalTo("The quick brown <1>fox.")); + assertHighlight(response, 0, "field1", 0, 1, + equalTo("The quick brown <1>fox. Second sentence.")); } public void testPostingsHighlighterNumberOfFragments() throws Exception { @@ -2012,9 +2026,12 @@ public class HighlighterSearchIT extends ESIntegTestCase { SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); + assertThat(searchResponse.getHits().getHits().length, equalTo(1)); + assertHighlight(searchResponse, 0, "field1", 0, 2, + equalTo("The quick brown fox jumps over the lazy dog." + + " The lazy red fox jumps over the quick dog.")); + assertHighlight(searchResponse, 0, "field1", 1, 2, + equalTo("The quick brown dog jumps over the lazy fox.")); client().prepareIndex("test", "type1", "2") .setSource("field1", new String[]{ @@ -2033,14 +2050,17 @@ public class HighlighterSearchIT extends ESIntegTestCase { for (SearchHit searchHit : searchResponse.getHits()) { if ("1".equals(searchHit.getId())) { - assertHighlight(searchHit, "field1", 0, 1, equalTo("The quick brown fox jumps over the lazy dog. " + assertHighlight(searchHit, "field1", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog. " + "The lazy red fox jumps over the quick dog. " + "The quick brown dog jumps over the lazy fox.")); } else if ("2".equals(searchHit.getId())) { assertHighlight(searchHit, "field1", 0, 3, equalTo("The quick brown fox jumps over the lazy dog. Second sentence not finished")); - assertHighlight(searchHit, "field1", 1, 3, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchHit, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); + assertHighlight(searchHit, "field1", 1, 3, + equalTo("The lazy red fox jumps over the quick dog.")); + assertHighlight(searchHit, "field1", 2, 3, + equalTo("The quick brown dog jumps over the lazy fox.")); } else { fail("Only hits with id 1 and 2 are returned"); } @@ -2083,7 +2103,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { logger.info("Running multi-match type: [{}] highlight with type: [{}]", matchQueryType, highlighterType); SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); assertHitCount(searchResponse, 1L); - assertHighlight(searchResponse, 0, "field1", 0, anyOf(equalTo("The quick brown fox jumps over"), + assertHighlight(searchResponse, 0, "field1", 0, + anyOf(equalTo("The quick brown fox jumps over"), equalTo("The quick brown fox jumps over"))); } } @@ -2112,13 +2133,15 @@ public class HighlighterSearchIT extends ESIntegTestCase { Map highlightFieldMap = searchResponse.getHits().getAt(0).getHighlightFields(); assertThat(highlightFieldMap.size(), equalTo(1)); HighlightField field1 = highlightFieldMap.get("field1"); - assertThat(field1.fragments().length, equalTo(5)); + assertThat(field1.fragments().length, equalTo(4)); assertThat(field1.fragments()[0].string(), equalTo("This sentence contains three sentence occurrences (sentence).")); - assertThat(field1.fragments()[1].string(), equalTo("This sentence contains two sentence matches.")); - assertThat(field1.fragments()[2].string(), equalTo("This is the second value's first sentence.")); - assertThat(field1.fragments()[3].string(), equalTo("This sentence contains one match, not that short.")); - assertThat(field1.fragments()[4].string(), + assertThat(field1.fragments()[1].string(), + equalTo("This sentence contains one match, not that short. " + + "This sentence contains two sentence matches.")); + assertThat(field1.fragments()[2].string(), + equalTo("This is the second value's first sentence. This one contains no matches.")); + assertThat(field1.fragments()[3].string(), equalTo("One sentence match here and scored lower since the text is quite long, not that appealing.")); } @@ -2139,7 +2162,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { for (int i = 0; i < indexRequestBuilders.length; i++) { assertHighlight(searchResponse, i, "title", 0, 1, - equalTo("This is a html escaping highlighting test for *&?")); + equalTo("This is a html escaping highlighting test for *&? elasticsearch")); } } @@ -2173,7 +2196,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { assertHitCount(searchResponse, 1L); SearchHit hit = searchResponse.getHits().getAt(0); //stopwords are not highlighted since not indexed - assertHighlight(hit, "title", 0, 1, equalTo("this is a test .")); + assertHighlight(hit, "title", 0, 1, equalTo("this is a test . Second sentence.")); // search on title.key and highlight on title searchResponse = client().prepareSearch() @@ -2183,7 +2206,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { //stopwords are now highlighted since we used only whitespace analyzer here assertHighlight(searchResponse, 0, "title.key", 0, 1, - equalTo("this is a test .")); + equalTo("this is a test . Second sentence.")); } public void testPostingsHighlighterMultiMapperFromSource() throws Exception { @@ -2258,7 +2281,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(highlight().field("field2").preTags("").postTags("")); SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterCommonTermsQuery() throws IOException { @@ -2275,7 +2299,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); assertHitCount(searchResponse, 1L); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } private static XContentBuilder type1PostingsffsetsMapping() throws IOException { @@ -2299,7 +2324,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { SearchSourceBuilder source = searchSource().query(prefixQuery("field2", "qui")) .highlighter(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterFuzzyQuery() throws Exception { @@ -2315,7 +2341,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterRegexpQuery() throws Exception { @@ -2331,7 +2358,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterWildcardQuery() throws Exception { @@ -2347,14 +2375,16 @@ public class HighlighterSearchIT extends ESIntegTestCase { .highlighter(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); source = searchSource().query(wildcardQuery("field2", "qu*k")) .highlighter(highlight().field("field2")); searchResponse = client().prepareSearch("test").setSource(source).get(); assertHitCount(searchResponse, 1L); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterTermRangeQuery() throws Exception { @@ -2384,7 +2414,8 @@ public class HighlighterSearchIT extends ESIntegTestCase { SearchSourceBuilder source = searchSource().query(queryStringQuery("qui*").defaultField("field2")) .highlighter(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); + assertHighlight(searchResponse, 0, "field2", 0, 1, + equalTo("The quick brown fox jumps over the lazy dog! Second sentence.")); } public void testPostingsHighlighterRegexpQueryWithinConstantScoreQuery() throws Exception { @@ -2479,7 +2510,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { assertThat(searchResponse.getHits().getHits().length, equalTo(COUNT)); for (SearchHit hit : searchResponse.getHits()) { String prefix = prefixes.get(hit.getId()); - assertHighlight(hit, "field1", 0, 1, equalTo("Sentence " + prefix + " test.")); + assertHighlight(hit, "field1", 0, 1, equalTo("Sentence " + prefix + " test. Sentence two.")); } }