From eeac6d27f29feb854d7f45a6f43832003ae365e2 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Fri, 24 Feb 2017 00:32:22 +0200 Subject: [PATCH] Add BreakIteratorBoundaryScanner support for FVH (#23248) This commit adds a boundary_scanner property to the search highlight request so the user can specify different boundary scanners: * `chars` (default, current behavior) * `word` Use a WordBreakIterator * `sentence` Use a SentenceBreakIterator This commit also adds "boundary_scanner_locale" to define which locale should be used when scanning the text. --- .../highlight/AbstractHighlighterBuilder.java | 87 +++++++++++++++++- .../highlight/FastVectorHighlighter.java | 41 +++++++-- .../subphase/highlight/HighlightBuilder.java | 36 +++++++- .../highlight/SearchContextHighlight.java | 30 +++++++ .../highlight/HighlightBuilderTests.java | 13 +++ .../highlight/HighlighterSearchIT.java | 89 +++++++++++++++++++ .../search/request/highlighting.asciidoc | 25 +++--- 7 files changed, 300 insertions(+), 21 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java index e3a78227d9c..3a3c1cfd66d 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.elasticsearch.Version; import org.elasticsearch.action.support.ToXContentToBytes; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; @@ -32,10 +33,12 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import 
org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import java.io.IOException; import java.util.Arrays; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.BiFunction; @@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilderfvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "chars". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(String boundaryScannerType) { + this.boundaryScannerType = BoundaryScannerType.fromString(boundaryScannerType); + return (HB) this; + } + + /** + * When using the highlighterType fvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "chars". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) { + this.boundaryScannerType = boundaryScannerType; + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerType(String)} + */ + public BoundaryScannerType boundaryScannerType() { + return this.boundaryScannerType; + } + /** * When using the highlighterType fvh this setting * controls how far to look for boundary characters, and defaults to 20. @@ -366,6 +420,25 @@ public abstract class AbstractHighlighterBuilderfvh and boundaryScannerType word or sentence, this setting + * controls the locale to use by the BreakIterator, defaults to "root". 
+ */ + @SuppressWarnings("unchecked") + public HB boundaryScannerLocale(String boundaryScannerLocale) { + if (boundaryScannerLocale != null) { + this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale); + } + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerLocale(String)} + */ + public Locale boundaryScannerLocale() { + return this.boundaryScannerLocale; + } + /** * Allows to set custom options for custom highlighters. */ @@ -491,12 +564,18 @@ public abstract class AbstractHighlighterBuilder 0) { builder.field(OPTIONS_FIELD.getPreferredName(), options); } @@ -523,8 +602,10 @@ public abstract class AbstractHighlighterBuilder hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD); + parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD); parser.declareString(HB::highlighterType, TYPE_FIELD); parser.declareString(HB::fragmenter, FRAGMENTER_FIELD); parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD); @@ -562,8 +643,8 @@ public abstract class AbstractHighlighterBuilder SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope); @@ -105,12 +114,7 @@ public class FastVectorHighlighter implements Highlighter { FragListBuilder fragListBuilder; BaseFragmentsBuilder fragmentsBuilder; - BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER; - if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN - || field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(), - field.fieldOptions().boundaryChars()); - } + final BoundaryScanner boundaryScanner = getBoundaryScanner(field); boolean forceSource = context.highlight().forceSource(field); if (field.fieldOptions().numberOfFragments() == 0) { fragListBuilder = new SingleFragListBuilder(); @@ -206,6 +210,29 @@ public class 
FastVectorHighlighter implements Highlighter { && fieldMapper.fieldType().storeTermVectorPositions(); } + private static BoundaryScanner getBoundaryScanner(Field field) { + final FieldOptions fieldOptions = field.fieldOptions(); + final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); + switch(fieldOptions.boundaryScannerType()) { + case SENTENCE: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; + default: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + } + } + private class MapperHighlightEntry { public FragListBuilder fragListBuilder; public FragmentsBuilder fragmentsBuilder; diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index a063b2900d5..45b8c612a76 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder fields = new ArrayList<>(); @@ -327,12 +327,18 @@ public class HighlightBuilder extends AbstractHighlighterBuilder= values().length) { + throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]"); + } + return values()[ordinal]; + } + + @Override + public void 
writeTo(StreamOutput out) throws IOException { + out.writeVInt(this.ordinal()); + } + + public static BoundaryScannerType fromString(String boundaryScannerType) { + return valueOf(boundaryScannerType.toUpperCase(Locale.ROOT)); + } + + @Override + public String toString() { + return name().toLowerCase(Locale.ROOT); + } + } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java index d4731718793..2baf73ab5fa 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java @@ -20,11 +20,13 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.Query; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -110,10 +112,14 @@ public class SearchContextHighlight { private String fragmenter; + private BoundaryScannerType boundaryScannerType; + private int boundaryMaxScan = -1; private Character[] boundaryChars = null; + private Locale boundaryScannerLocale; + private Query highlightQuery; private int noMatchSize = -1; @@ -168,6 +174,10 @@ public class SearchContextHighlight { return fragmenter; } + public BoundaryScannerType boundaryScannerType() { + return boundaryScannerType; + } + public int boundaryMaxScan() { return boundaryMaxScan; } @@ -176,6 +186,10 @@ public class SearchContextHighlight { return boundaryChars; } + public Locale boundaryScannerLocale() { + return boundaryScannerLocale; + } + public Query highlightQuery() { return highlightQuery; } @@ -260,6 +274,11 @@ public class SearchContextHighlight { 
return this; } + Builder boundaryScannerType(BoundaryScannerType boundaryScanner) { + fieldOptions.boundaryScannerType = boundaryScanner; + return this; + } + Builder boundaryMaxScan(int boundaryMaxScan) { fieldOptions.boundaryMaxScan = boundaryMaxScan; return this; @@ -270,6 +289,11 @@ public class SearchContextHighlight { return this; } + Builder boundaryScannerLocale(Locale boundaryScannerLocale) { + fieldOptions.boundaryScannerLocale = boundaryScannerLocale; + return this; + } + Builder highlightQuery(Query highlightQuery) { fieldOptions.highlightQuery = highlightQuery; return this; @@ -324,12 +348,18 @@ public class SearchContextHighlight { if (fieldOptions.requireFieldMatch == null) { fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch; } + if (fieldOptions.boundaryScannerType == null) { + fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType; + } if (fieldOptions.boundaryMaxScan == -1) { fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan; } if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) { fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length); } + if (fieldOptions.boundaryScannerLocale == null) { + fieldOptions.boundaryScannerLocale = globalOptions.boundaryScannerLocale; + } if (fieldOptions.highlighterType == null) { fieldOptions.highlighterType = globalOptions.highlighterType; } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java index 944427b7e17..e33b201bf22 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java @@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryParseContext; import 
org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions; @@ -288,6 +289,7 @@ public class HighlightBuilderTests extends ESTestCase { mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions); checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars); + checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType); checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan); checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize); checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter); @@ -557,12 +559,23 @@ public class HighlightBuilderTests extends ESTestCase { if (randomBoolean()) { highlightBuilder.forceSource(randomBoolean()); } + if (randomBoolean()) { + if (randomBoolean()) { + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values())); + } else { + // also test the string setter + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()).toString()); + } + } if (randomBoolean()) { highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10)); } if (randomBoolean()) { highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray()); } + if (randomBoolean()) { + highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag()); + } if (randomBoolean()) { highlightBuilder.noMatchSize(randomIntBetween(0, 10)); } diff --git 
a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 815998ad093..7db99ff3232 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -44,6 +44,7 @@ import org.elasticsearch.plugins.Plugin; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; @@ -57,6 +58,7 @@ import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.elasticsearch.client.Requests.searchRequest; @@ -747,7 +749,94 @@ public class HighlighterSearchIT extends ESIntegTestCase { searchResponse = client().prepareSearch("test").setSource(source).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); + } + public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. 
Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + } + + public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. 
")); + } + + public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + } + + public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); } /** diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc index 30c0e20d5bf..81f454bb158 100644 --- a/docs/reference/search/request/highlighting.asciidoc +++ 
b/docs/reference/search/request/highlighting.asciidoc @@ -103,8 +103,7 @@ If `term_vector` information is provided by setting `term_vector` to will be used instead of the plain highlighter. The fast vector highlighter: * Is faster especially for large fields (> `1MB`) -* Can be customized with `boundary_chars`, `boundary_max_scan`, and - `fragment_offset` (see <<boundary-characters>>) +* Can be customized with `boundary_scanner` (see <<boundary-scanners>>) * Requires setting `term_vector` to `with_positions_offsets` which increases the size of the index * Can combine matches from multiple fields into one result. See @@ -502,17 +501,23 @@ GET /_search -------------------------------------------------- // CONSOLE -[[boundary-characters]] -==== Boundary Characters +[[boundary-scanners]] +==== Boundary Scanners -When highlighting a field using the fast vector highlighter, -`boundary_chars` can be configured to define what constitutes a boundary -for highlighting. It's a single string with each boundary character -defined in it. It defaults to `.,!? \t\n`. +When highlighting a field using the fast vector highlighter, you can specify +how to break the highlighted fragments using `boundary_scanner`, which accepts +the following values: -The `boundary_max_scan` allows to control how far to look for boundary -characters, and defaults to `20`. +* `chars` (default): lets you configure which characters (`boundary_chars`) +constitute a boundary for highlighting. It's a single string with each boundary +character defined in it (defaults to `.,!? \t\n`). It also allows configuring +the `boundary_max_scan` to control how far to look for boundary characters +(defaults to `20`). +* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator] +to break the highlighted fragments at the next _word_ or _sentence_ boundary. +You can further specify `boundary_scanner_locale` to control which Locale is used +to search the text for these boundaries. 
[[matched-fields]] ==== Matched Fields