Add BreakIteratorBoundaryScanner support for FVH (#23248)

This commit adds a boundary_scanner property to the search highlight
request so the user can specify different boundary scanners:

* `chars` (default, current behavior): uses the `SimpleBoundaryScanner`
* `word`: uses a word-level `BreakIterator`
* `sentence`: uses a sentence-level `BreakIterator`

This commit also adds "boundary_scanner_locale" to define which locale
should be used when scanning the text.
This commit is contained in:
Shai Erera 2017-02-24 00:32:22 +02:00 committed by Jim Ferenczi
parent 25a9a7ee3a
commit eeac6d27f2
7 changed files with 300 additions and 21 deletions

View File

@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ToXContentToBytes; import org.elasticsearch.action.support.ToXContentToBytes;
import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.ParsingException;
@ -32,10 +33,12 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.function.BiFunction; import java.util.function.BiFunction;
@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField NUMBER_OF_FRAGMENTS_FIELD = new ParseField("number_of_fragments"); public static final ParseField NUMBER_OF_FRAGMENTS_FIELD = new ParseField("number_of_fragments");
public static final ParseField ENCODER_FIELD = new ParseField("encoder"); public static final ParseField ENCODER_FIELD = new ParseField("encoder");
public static final ParseField REQUIRE_FIELD_MATCH_FIELD = new ParseField("require_field_match"); public static final ParseField REQUIRE_FIELD_MATCH_FIELD = new ParseField("require_field_match");
public static final ParseField BOUNDARY_SCANNER_FIELD = new ParseField("boundary_scanner");
public static final ParseField BOUNDARY_MAX_SCAN_FIELD = new ParseField("boundary_max_scan"); public static final ParseField BOUNDARY_MAX_SCAN_FIELD = new ParseField("boundary_max_scan");
public static final ParseField BOUNDARY_CHARS_FIELD = new ParseField("boundary_chars"); public static final ParseField BOUNDARY_CHARS_FIELD = new ParseField("boundary_chars");
public static final ParseField BOUNDARY_SCANNER_LOCALE_FIELD = new ParseField("boundary_scanner_locale");
public static final ParseField TYPE_FIELD = new ParseField("type"); public static final ParseField TYPE_FIELD = new ParseField("type");
public static final ParseField FRAGMENTER_FIELD = new ParseField("fragmenter"); public static final ParseField FRAGMENTER_FIELD = new ParseField("fragmenter");
public static final ParseField NO_MATCH_SIZE_FIELD = new ParseField("no_match_size"); public static final ParseField NO_MATCH_SIZE_FIELD = new ParseField("no_match_size");
@ -88,10 +93,14 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
protected Boolean forceSource; protected Boolean forceSource;
protected BoundaryScannerType boundaryScannerType;
protected Integer boundaryMaxScan; protected Integer boundaryMaxScan;
protected char[] boundaryChars; protected char[] boundaryChars;
protected Locale boundaryScannerLocale;
protected Integer noMatchSize; protected Integer noMatchSize;
protected Integer phraseLimit; protected Integer phraseLimit;
@ -119,10 +128,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
order(in.readOptionalWriteable(Order::readFromStream)); order(in.readOptionalWriteable(Order::readFromStream));
highlightFilter(in.readOptionalBoolean()); highlightFilter(in.readOptionalBoolean());
forceSource(in.readOptionalBoolean()); forceSource(in.readOptionalBoolean());
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream));
}
boundaryMaxScan(in.readOptionalVInt()); boundaryMaxScan(in.readOptionalVInt());
if (in.readBoolean()) { if (in.readBoolean()) {
boundaryChars(in.readString().toCharArray()); boundaryChars(in.readString().toCharArray());
} }
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
if (in.readBoolean()) {
boundaryScannerLocale(in.readString());
}
}
noMatchSize(in.readOptionalVInt()); noMatchSize(in.readOptionalVInt());
phraseLimit(in.readOptionalVInt()); phraseLimit(in.readOptionalVInt());
if (in.readBoolean()) { if (in.readBoolean()) {
@ -150,12 +167,22 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
out.writeOptionalWriteable(order); out.writeOptionalWriteable(order);
out.writeOptionalBoolean(highlightFilter); out.writeOptionalBoolean(highlightFilter);
out.writeOptionalBoolean(forceSource); out.writeOptionalBoolean(forceSource);
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
out.writeOptionalWriteable(boundaryScannerType);
}
out.writeOptionalVInt(boundaryMaxScan); out.writeOptionalVInt(boundaryMaxScan);
boolean hasBounaryChars = boundaryChars != null; boolean hasBounaryChars = boundaryChars != null;
out.writeBoolean(hasBounaryChars); out.writeBoolean(hasBounaryChars);
if (hasBounaryChars) { if (hasBounaryChars) {
out.writeString(String.valueOf(boundaryChars)); out.writeString(String.valueOf(boundaryChars));
} }
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boolean hasBoundaryScannerLocale = boundaryScannerLocale != null;
out.writeBoolean(hasBoundaryScannerLocale);
if (hasBoundaryScannerLocale) {
out.writeString(boundaryScannerLocale.toLanguageTag());
}
}
out.writeOptionalVInt(noMatchSize); out.writeOptionalVInt(noMatchSize);
out.writeOptionalVInt(phraseLimit); out.writeOptionalVInt(phraseLimit);
boolean hasOptions = options != null; boolean hasOptions = options != null;
@ -331,6 +358,33 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
return this.highlightFilter; return this.highlightFilter;
} }
/**
 * Selects, by name, the boundary scanner used to delimit fragments when the
 * highlighterType is <tt>fvh</tt>. The name is upper-cased and resolved against
 * {@link BoundaryScannerType}, so <tt>chars</tt>, <tt>word</tt> and <tt>sentence</tt>
 * are accepted in any letter case. The default highlight options use <tt>chars</tt>.
 *
 * @param boundaryScannerType name of the scanner type; must not be <tt>null</tt>
 * @return this builder, to allow call chaining
 * @throws IllegalArgumentException if the name matches no {@link BoundaryScannerType}
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerType(String boundaryScannerType) {
    final BoundaryScannerType parsedType = BoundaryScannerType.fromString(boundaryScannerType);
    this.boundaryScannerType = parsedType;
    return (HB) this;
}
/**
 * Selects the boundary scanner used to delimit fragments when the highlighterType
 * is <tt>fvh</tt>. The default highlight options use {@link BoundaryScannerType#CHARS}.
 *
 * @param boundaryScannerType the scanner type to use; stored as given
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) {
    final HB self = (HB) this;
    this.boundaryScannerType = boundaryScannerType;
    return self;
}
/**
 * @return the boundary scanner type set through {@link #boundaryScannerType(String)}
 *         or {@link #boundaryScannerType(BoundaryScannerType)}, or <tt>null</tt> if
 *         none was set on this builder
 */
public BoundaryScannerType boundaryScannerType() {
    return boundaryScannerType;
}
/** /**
* When using the highlighterType <tt>fvh</tt> this setting * When using the highlighterType <tt>fvh</tt> this setting
* controls how far to look for boundary characters, and defaults to 20. * controls how far to look for boundary characters, and defaults to 20.
@ -366,6 +420,25 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
return this.boundaryChars; return this.boundaryChars;
} }
/**
 * When using the highlighterType <tt>fvh</tt> with a boundaryScannerType of
 * <tt>word</tt> or <tt>sentence</tt>, this setting controls the locale handed to the
 * BreakIterator. The default highlight options use the root locale.
 *
 * @param boundaryScannerLocale an IETF BCP 47 language tag, parsed with
 *        {@link Locale#forLanguageTag(String)}; a <tt>null</tt> value is ignored and
 *        leaves any previously set locale in place
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerLocale(String boundaryScannerLocale) {
    final HB self = (HB) this;
    if (boundaryScannerLocale == null) {
        // nothing to parse: keep whatever locale (if any) was configured before
        return self;
    }
    this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale);
    return self;
}
/**
 * @return the locale parsed from the tag passed to {@link #boundaryScannerLocale(String)},
 *         or <tt>null</tt> if none was set on this builder
 */
public Locale boundaryScannerLocale() {
    return boundaryScannerLocale;
}
/** /**
* Allows to set custom options for custom highlighters. * Allows to set custom options for custom highlighters.
*/ */
@ -491,12 +564,18 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
if (highlightFilter != null) { if (highlightFilter != null) {
builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter); builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter);
} }
if (boundaryScannerType != null) {
builder.field(BOUNDARY_SCANNER_FIELD.getPreferredName(), boundaryScannerType.name());
}
if (boundaryMaxScan != null) { if (boundaryMaxScan != null) {
builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan); builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan);
} }
if (boundaryChars != null) { if (boundaryChars != null) {
builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars)); builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars));
} }
if (boundaryScannerLocale != null) {
builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag());
}
if (options != null && options.size() > 0) { if (options != null && options.size() > 0) {
builder.field(OPTIONS_FIELD.getPreferredName(), options); builder.field(OPTIONS_FIELD.getPreferredName(), options);
} }
@ -523,8 +602,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
parser.declareInt(HB::fragmentSize, FRAGMENT_SIZE_FIELD); parser.declareInt(HB::fragmentSize, FRAGMENT_SIZE_FIELD);
parser.declareInt(HB::numOfFragments, NUMBER_OF_FRAGMENTS_FIELD); parser.declareInt(HB::numOfFragments, NUMBER_OF_FRAGMENTS_FIELD);
parser.declareBoolean(HB::requireFieldMatch, REQUIRE_FIELD_MATCH_FIELD); parser.declareBoolean(HB::requireFieldMatch, REQUIRE_FIELD_MATCH_FIELD);
parser.declareString(HB::boundaryScannerType, BOUNDARY_SCANNER_FIELD);
parser.declareInt(HB::boundaryMaxScan, BOUNDARY_MAX_SCAN_FIELD); parser.declareInt(HB::boundaryMaxScan, BOUNDARY_MAX_SCAN_FIELD);
parser.declareString((HB hb, String bc) -> hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD); parser.declareString((HB hb, String bc) -> hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD);
parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD);
parser.declareString(HB::highlighterType, TYPE_FIELD); parser.declareString(HB::highlighterType, TYPE_FIELD);
parser.declareString(HB::fragmenter, FRAGMENTER_FIELD); parser.declareString(HB::fragmenter, FRAGMENTER_FIELD);
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD); parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
@ -562,8 +643,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public final int hashCode() { public final int hashCode() {
return Objects.hash(getClass(), Arrays.hashCode(preTags), Arrays.hashCode(postTags), fragmentSize, return Objects.hash(getClass(), Arrays.hashCode(preTags), Arrays.hashCode(postTags), fragmentSize,
numOfFragments, highlighterType, fragmenter, highlightQuery, order, highlightFilter, numOfFragments, highlighterType, fragmenter, highlightQuery, order, highlightFilter,
forceSource, boundaryMaxScan, Arrays.hashCode(boundaryChars), noMatchSize, forceSource, boundaryScannerType, boundaryMaxScan, Arrays.hashCode(boundaryChars), boundaryScannerLocale,
phraseLimit, options, requireFieldMatch, doHashCode()); noMatchSize, phraseLimit, options, requireFieldMatch, doHashCode());
} }
/** /**
@ -591,8 +672,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
Objects.equals(order, other.order) && Objects.equals(order, other.order) &&
Objects.equals(highlightFilter, other.highlightFilter) && Objects.equals(highlightFilter, other.highlightFilter) &&
Objects.equals(forceSource, other.forceSource) && Objects.equals(forceSource, other.forceSource) &&
Objects.equals(boundaryScannerType, other.boundaryScannerType) &&
Objects.equals(boundaryMaxScan, other.boundaryMaxScan) && Objects.equals(boundaryMaxScan, other.boundaryMaxScan) &&
Arrays.equals(boundaryChars, other.boundaryChars) && Arrays.equals(boundaryChars, other.boundaryChars) &&
Objects.equals(boundaryScannerLocale, other.boundaryScannerLocale) &&
Objects.equals(noMatchSize, other.noMatchSize) && Objects.equals(noMatchSize, other.noMatchSize) &&
Objects.equals(phraseLimit, other.phraseLimit) && Objects.equals(phraseLimit, other.phraseLimit) &&
Objects.equals(options, other.options) && Objects.equals(options, other.options) &&

View File

@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder; import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery; import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldFragList; import org.apache.lucene.search.vectorhighlight.FieldFragList;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
@ -38,15 +39,23 @@ import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException; import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase; import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.internal.SearchContext;
import java.text.BreakIterator;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale;
import java.util.Map; import java.util.Map;
public class FastVectorHighlighter implements Highlighter { public class FastVectorHighlighter implements Highlighter {
private static final SimpleBoundaryScanner DEFAULT_BOUNDARY_SCANNER = new SimpleBoundaryScanner(); private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getSentenceInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getWordInstance(Locale.ROOT));
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
true, Setting.Property.NodeScope); true, Setting.Property.NodeScope);
@ -105,12 +114,7 @@ public class FastVectorHighlighter implements Highlighter {
FragListBuilder fragListBuilder; FragListBuilder fragListBuilder;
BaseFragmentsBuilder fragmentsBuilder; BaseFragmentsBuilder fragmentsBuilder;
BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER; final BoundaryScanner boundaryScanner = getBoundaryScanner(field);
if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|| field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(),
field.fieldOptions().boundaryChars());
}
boolean forceSource = context.highlight().forceSource(field); boolean forceSource = context.highlight().forceSource(field);
if (field.fieldOptions().numberOfFragments() == 0) { if (field.fieldOptions().numberOfFragments() == 0) {
fragListBuilder = new SingleFragListBuilder(); fragListBuilder = new SingleFragListBuilder();
@ -206,6 +210,29 @@ public class FastVectorHighlighter implements Highlighter {
&& fieldMapper.fieldType().storeTermVectorPositions(); && fieldMapper.fieldType().storeTermVectorPositions();
} }
/**
 * Chooses the {@link BoundaryScanner} for a field from its highlight options.
 * <ul>
 * <li><tt>SENTENCE</tt> / <tt>WORD</tt>: a {@link BreakIteratorBoundaryScanner} over a sentence
 * or word {@link BreakIterator} for the configured locale.</li>
 * <li>anything else (<tt>CHARS</tt>): a {@link SimpleBoundaryScanner}, shared when both
 * max-scan and boundary chars are the Lucene defaults.</li>
 * </ul>
 * A missing scanner type is treated as <tt>CHARS</tt> and a missing locale as
 * {@link Locale#ROOT}, so options that were not merged with the defaults do not fail here.
 *
 * @param field the highlighted field whose options are inspected
 * @return the boundary scanner to use for this field; never <tt>null</tt>
 */
private static BoundaryScanner getBoundaryScanner(Field field) {
    final FieldOptions fieldOptions = field.fieldOptions();
    // switch on a null enum throws a NullPointerException, so resolve the default up front
    final HighlightBuilder.BoundaryScannerType scannerType = fieldOptions.boundaryScannerType() != null
        ? fieldOptions.boundaryScannerType()
        : HighlightBuilder.BoundaryScannerType.CHARS;
    final Locale scannerLocale = fieldOptions.boundaryScannerLocale() != null
        ? fieldOptions.boundaryScannerLocale()
        : Locale.ROOT;
    switch (scannerType) {
        case SENTENCE:
            // A BreakIterator keeps its text and position as mutable state, so every call gets
            // its own instance instead of a scanner shared between concurrent requests.
            return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(scannerLocale));
        case WORD:
            return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(scannerLocale));
        default:
            // NOTE(review): boundaryChars is compared by identity, as in the original; only the
            // very same default array instance takes the shared-scanner shortcut.
            if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
                    || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
                return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
            }
            return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
    }
}
private class MapperHighlightEntry { private class MapperHighlightEntry {
public FragListBuilder fragListBuilder; public FragListBuilder fragListBuilder;
public FragmentsBuilder fragmentsBuilder; public FragmentsBuilder fragmentsBuilder;

View File

@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED) .preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH) .highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE) .forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER) .numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) .boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
.noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build(); .boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
private final List<Field> fields = new ArrayList<>(); private final List<Field> fields = new ArrayList<>();
@ -327,12 +327,18 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
if (highlighterBuilder.requireFieldMatch != null) { if (highlighterBuilder.requireFieldMatch != null) {
targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch); targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch);
} }
if (highlighterBuilder.boundaryScannerType != null) {
targetOptionsBuilder.boundaryScannerType(highlighterBuilder.boundaryScannerType);
}
if (highlighterBuilder.boundaryMaxScan != null) { if (highlighterBuilder.boundaryMaxScan != null) {
targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan); targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan);
} }
if (highlighterBuilder.boundaryChars != null) { if (highlighterBuilder.boundaryChars != null) {
targetOptionsBuilder.boundaryChars(convertCharArray(highlighterBuilder.boundaryChars)); targetOptionsBuilder.boundaryChars(convertCharArray(highlighterBuilder.boundaryChars));
} }
if (highlighterBuilder.boundaryScannerLocale != null) {
targetOptionsBuilder.boundaryScannerLocale(highlighterBuilder.boundaryScannerLocale);
}
if (highlighterBuilder.highlighterType != null) { if (highlighterBuilder.highlighterType != null) {
targetOptionsBuilder.highlighterType(highlighterBuilder.highlighterType); targetOptionsBuilder.highlighterType(highlighterBuilder.highlighterType);
} }
@ -522,4 +528,30 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
return name().toLowerCase(Locale.ROOT); return name().toLowerCase(Locale.ROOT);
} }
} }
/**
 * The kinds of boundary scanner that can be requested for highlighting.
 * Instances are written to the transport stream as their ordinal, so the
 * declaration order of the constants is part of the wire format.
 */
public enum BoundaryScannerType implements Writeable {
    CHARS, WORD, SENTENCE;

    /**
     * Reads a type previously written with {@link #writeTo(StreamOutput)}.
     *
     * @throws IOException if the stream holds an ordinal outside the known constants
     */
    public static BoundaryScannerType readFromStream(StreamInput in) throws IOException {
        final int ordinal = in.readVInt();
        final BoundaryScannerType[] types = values();
        if (ordinal >= 0 && ordinal < types.length) {
            return types[ordinal];
        }
        throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]");
    }

    /** Writes this constant's ordinal as a variable-length int. */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVInt(ordinal());
    }

    /**
     * Resolves a type from its name, ignoring letter case.
     *
     * @throws IllegalArgumentException if no constant has the given name
     */
    public static BoundaryScannerType fromString(String boundaryScannerType) {
        final String constantName = boundaryScannerType.toUpperCase(Locale.ROOT);
        return valueOf(constantName);
    }

    /** Returns the lower-case form of the constant name. */
    @Override
    public String toString() {
        final String constantName = name();
        return constantName.toLowerCase(Locale.ROOT);
    }
}
} }

View File

@ -20,11 +20,13 @@
package org.elasticsearch.search.fetch.subphase.highlight; package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -110,10 +112,14 @@ public class SearchContextHighlight {
private String fragmenter; private String fragmenter;
private BoundaryScannerType boundaryScannerType;
private int boundaryMaxScan = -1; private int boundaryMaxScan = -1;
private Character[] boundaryChars = null; private Character[] boundaryChars = null;
private Locale boundaryScannerLocale;
private Query highlightQuery; private Query highlightQuery;
private int noMatchSize = -1; private int noMatchSize = -1;
@ -168,6 +174,10 @@ public class SearchContextHighlight {
return fragmenter; return fragmenter;
} }
/**
 * Returns the boundary scanner type held by these options; may be null when
 * neither the field nor the merged global options set one.
 */
public BoundaryScannerType boundaryScannerType() {
return boundaryScannerType;
}
public int boundaryMaxScan() { public int boundaryMaxScan() {
return boundaryMaxScan; return boundaryMaxScan;
} }
@ -176,6 +186,10 @@ public class SearchContextHighlight {
return boundaryChars; return boundaryChars;
} }
/**
 * Returns the locale held by these options for break-iterator based boundary
 * scanners; may be null when neither the field nor the merged global options set one.
 */
public Locale boundaryScannerLocale() {
return boundaryScannerLocale;
}
public Query highlightQuery() { public Query highlightQuery() {
return highlightQuery; return highlightQuery;
} }
@ -260,6 +274,11 @@ public class SearchContextHighlight {
return this; return this;
} }
/**
 * Stores the given boundary scanner type on the field options being built.
 *
 * @return this builder, to allow call chaining
 */
Builder boundaryScannerType(BoundaryScannerType boundaryScanner) {
fieldOptions.boundaryScannerType = boundaryScanner;
return this;
}
Builder boundaryMaxScan(int boundaryMaxScan) { Builder boundaryMaxScan(int boundaryMaxScan) {
fieldOptions.boundaryMaxScan = boundaryMaxScan; fieldOptions.boundaryMaxScan = boundaryMaxScan;
return this; return this;
@ -270,6 +289,11 @@ public class SearchContextHighlight {
return this; return this;
} }
/**
 * Stores the given boundary scanner locale on the field options being built.
 *
 * @return this builder, to allow call chaining
 */
Builder boundaryScannerLocale(Locale boundaryScannerLocale) {
fieldOptions.boundaryScannerLocale = boundaryScannerLocale;
return this;
}
Builder highlightQuery(Query highlightQuery) { Builder highlightQuery(Query highlightQuery) {
fieldOptions.highlightQuery = highlightQuery; fieldOptions.highlightQuery = highlightQuery;
return this; return this;
@ -324,12 +348,18 @@ public class SearchContextHighlight {
if (fieldOptions.requireFieldMatch == null) { if (fieldOptions.requireFieldMatch == null) {
fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch; fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch;
} }
if (fieldOptions.boundaryScannerType == null) {
fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType;
}
if (fieldOptions.boundaryMaxScan == -1) { if (fieldOptions.boundaryMaxScan == -1) {
fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan; fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan;
} }
if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) { if (fieldOptions.boundaryChars == null && globalOptions.boundaryChars != null) {
fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length); fieldOptions.boundaryChars = Arrays.copyOf(globalOptions.boundaryChars, globalOptions.boundaryChars.length);
} }
if (fieldOptions.boundaryScannerLocale == null) {
fieldOptions.boundaryScannerLocale = globalOptions.boundaryScannerLocale;
}
if (fieldOptions.highlighterType == null) { if (fieldOptions.highlighterType == null) {
fieldOptions.highlighterType = globalOptions.highlighterType; fieldOptions.highlighterType = globalOptions.highlighterType;
} }

View File

@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchModule; import org.elasticsearch.search.SearchModule;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions; import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
@ -288,6 +289,7 @@ public class HighlightBuilderTests extends ESTestCase {
mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions); mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions);
checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars); checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars);
checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType);
checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan); checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan);
checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize); checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize);
checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter); checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter);
@ -557,12 +559,23 @@ public class HighlightBuilderTests extends ESTestCase {
if (randomBoolean()) { if (randomBoolean()) {
highlightBuilder.forceSource(randomBoolean()); highlightBuilder.forceSource(randomBoolean());
} }
if (randomBoolean()) {
if (randomBoolean()) {
highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()));
} else {
// also test the string setter
highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values()).toString());
}
}
if (randomBoolean()) { if (randomBoolean()) {
highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10)); highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10));
} }
if (randomBoolean()) { if (randomBoolean()) {
highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray()); highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray());
} }
if (randomBoolean()) {
highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag());
}
if (randomBoolean()) { if (randomBoolean()) {
highlightBuilder.noMatchSize(randomIntBetween(0, 10)); highlightBuilder.noMatchSize(randomIntBetween(0, 10));
} }

View File

@ -44,6 +44,7 @@ import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus; import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase;
@ -57,6 +58,7 @@ import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import static org.elasticsearch.client.Requests.searchRequest; import static org.elasticsearch.client.Requests.searchRequest;
@ -747,7 +749,94 @@ public class HighlighterSearchIT extends ESIntegTestCase {
searchResponse = client().prepareSearch("test").setSource(source).get(); searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over")); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
}
/**
 * Highlights a two-sentence document with the <tt>sentence</tt> boundary scanner and
 * expects each returned fragment to be one whole sentence.
 */
public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
    final String text = "A sentence with few words. Another sentence with even more words.";
    final String[] expectedFragments = {
        "A <xxx>sentence</xxx> with few words. ",
        "Another <xxx>sentence</xxx> with even more words. "
    };

    assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
    ensureGreen();
    indexRandom(true, client().prepareIndex("test", "type1").setSource("field1", text));

    logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
    SearchSourceBuilder source = searchSource()
        .query(termQuery("field1", "sentence"))
        .highlighter(highlight()
            .field("field1", 20, 2)
            .order("score")
            .preTags("<xxx>").postTags("</xxx>")
            .boundaryScannerType(BoundaryScannerType.SENTENCE));

    SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();

    // fragments are checked in order; both sentences must come back
    for (int fragment = 0; fragment < expectedFragments.length; fragment++) {
        assertHighlight(searchResponse, 0, "field1", fragment, 2, equalTo(expectedFragments[fragment]));
    }
}
/**
 * Same scenario as the plain sentence boundary scanner test, but the request also sets
 * the boundary scanner locale to the English language tag. The expected fragments are
 * one sentence each, including the whitespace that trails the closing period.
 */
public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
    final String field = "field1";
    final String text = "A sentence with few words. Another sentence with even more words.";

    assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
    ensureGreen();
    indexRandom(true, client().prepareIndex("test", "type1").setSource(field, text));

    logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
    // The locale is handed over as a language tag string rather than a Locale object.
    SearchSourceBuilder request = searchSource().query(termQuery(field, "sentence"));
    request.highlighter(highlight()
        .field(field, 20, 2)
        .order("score")
        .preTags("<xxx>").postTags("</xxx>")
        .boundaryScannerType(BoundaryScannerType.SENTENCE)
        .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));

    SearchResponse response = client().prepareSearch("test").setSource(request).get();

    assertHighlight(response, 0, field, 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
    assertHighlight(response, 0, field, 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
}
/**
 * Highlights a single-sentence document with the {@code WORD} boundary scanner and
 * checks that the one returned fragment stops right after "brown", i.e. before the
 * ':' that separates it from "fox" in the indexed text.
 */
public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
    final String field = "field1";
    final String text = "some quick and hairy brown:fox jumped over the lazy dog";

    assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
    ensureGreen();
    indexRandom(true, client().prepareIndex("test", "type1").setSource(field, text));

    logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
    SearchSourceBuilder request = searchSource().query(termQuery(field, "some"));
    request.highlighter(highlight()
        .field(field, 23, 1)
        .order("score")
        .preTags("<xxx>").postTags("</xxx>")
        .boundaryScannerType(BoundaryScannerType.WORD));

    SearchResponse response = client().prepareSearch("test").setSource(request).get();

    assertHighlight(response, 0, field, 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
}
/**
 * Highlights a single-sentence document with the {@code WORD} boundary scanner while
 * also setting the boundary scanner locale to the English language tag, and checks
 * that the one returned fragment stops right after "brown".
 */
public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
// Query for the first word of the text; a single fragment is requested for field1.
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD)
// The locale is passed as a language tag string, not as a Locale instance.
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
// The expected fragment ends at "brown", before the ':' present in the indexed text.
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
} }
/** /**

View File

@ -103,8 +103,7 @@ If `term_vector` information is provided by setting `term_vector` to
will be used instead of the plain highlighter. The fast vector highlighter: will be used instead of the plain highlighter. The fast vector highlighter:
* Is faster especially for large fields (> `1MB`) * Is faster especially for large fields (> `1MB`)
* Can be customized with `boundary_chars`, `boundary_max_scan`, and * Can be customized with `boundary_scanner` (see <<boundary-scanners,below>>)
`fragment_offset` (see <<boundary-characters,below>>)
* Requires setting `term_vector` to `with_positions_offsets` which * Requires setting `term_vector` to `with_positions_offsets` which
increases the size of the index increases the size of the index
* Can combine matches from multiple fields into one result. See * Can combine matches from multiple fields into one result. See
@ -502,17 +501,23 @@ GET /_search
-------------------------------------------------- --------------------------------------------------
// CONSOLE // CONSOLE
[[boundary-characters]] [[boundary-scanners]]
==== Boundary Characters ==== Boundary Scanners
When highlighting a field using the fast vector highlighter, When highlighting a field using the fast vector highlighter, you can specify
`boundary_chars` can be configured to define what constitutes a boundary how to break the highlighted fragments using `boundary_scanner`, which accepts
for highlighting. It's a single string with each boundary character the following values:
defined in it. It defaults to `.,!? \t\n`.
The `boundary_max_scan` allows to control how far to look for boundary * `chars` (default): lets you configure which characters (`boundary_chars`)
characters, and defaults to `20`. constitute a boundary for highlighting. It's a single string with each boundary
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
the `boundary_max_scan` to control how far to look for boundary characters
(defaults to `20`).
* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _word_ or _sentence_ boundary.
You can further specify `boundary_scanner_locale`, given as a language tag
(for example `en-US`), to control which locale is used when scanning the text
for these boundaries.
[[matched-fields]] [[matched-fields]]
==== Matched Fields ==== Matched Fields