Add support for fragment_length in the unified highlighter (#23431)

* Add support for fragment_length in the unified highlighter

This commit introduces a new break iterator (BoundedBreakIteratorScanner) designed for the unified highlighter that is able to limit the size of fragments produced by a generic break iterator like `sentence`.
The `unified` highlighter now supports `boundary_scanner`, which can be `word` or `sentence`.
The `sentence` mode uses the bounded break iterator to limit the size of each sentence to `fragment_length`.
When a sentence bigger than `fragment_length` is produced, this mode breaks it at the next word boundary **after** `fragment_length` is reached.
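As a usage sketch (the field name and sizes below are illustrative, not part of this change; `fragment_length` corresponds to the fragment size set on the field):

import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;

// Sketch: a `unified` highlighter request whose sentence passages are capped
// at the fragment size (100 chars and 2 fragments are assumed example values).
HighlightBuilder highlight = new HighlightBuilder()
    .field("field1", 100, 2)
    .highlighterType("unified")
    .boundaryScannerType(BoundaryScannerType.SENTENCE);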
Commit b8c352fc3f (parent c462d7d486), authored by Jim Ferenczi on 2017-03-17 18:10:13 +01:00 and committed by GitHub.
10 changed files with 763 additions and 338 deletions


@@ -0,0 +1,171 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;
/**
* A custom break iterator that scans text to find break-delimited passages bounded by
* a provided maximum length. This class delegates the boundary search to a first level
* break iterator. When this break iterator finds a passage greater than the maximum length
* a secondary break iterator is used to re-split the passage at the first boundary after
* maximum length.
* This is useful to split passages created by {@link BreakIterator}s like `sentence` that
* can create big outliers on semi-structured text.
*
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
**/
public class BoundedBreakIteratorScanner extends BreakIterator {
private final BreakIterator mainBreak;
private final BreakIterator innerBreak;
private final int maxLen;
private int lastPrecedingOffset = -1;
private int windowStart = -1;
private int windowEnd = -1;
private int innerStart = -1;
private int innerEnd = 0;
private BoundedBreakIteratorScanner(BreakIterator mainBreak,
BreakIterator innerBreak,
int maxLen) {
this.mainBreak = mainBreak;
this.innerBreak = innerBreak;
this.maxLen = maxLen;
}
@Override
public CharacterIterator getText() {
return mainBreak.getText();
}
@Override
public void setText(CharacterIterator newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}
@Override
public void setText(String newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}
private void reset() {
lastPrecedingOffset = -1;
windowStart = -1;
windowEnd = -1;
innerStart = -1;
innerEnd = 0;
}
/**
* Must be called with increasing offset. See {@link FieldHighlighter} for usage.
*/
@Override
public int preceding(int offset) {
if (offset < lastPrecedingOffset) {
throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
if (offset > windowStart && offset < windowEnd) {
innerStart = innerEnd;
innerEnd = windowEnd;
} else {
windowStart = innerStart = mainBreak.preceding(offset);
windowEnd = innerEnd = mainBreak.following(offset-1);
}
if (innerEnd - innerStart > maxLen) {
// the current split is too big,
// so starting from the current term we try to find boundaries on the left first
if (offset - maxLen > innerStart) {
innerStart = Math.max(innerStart,
innerBreak.preceding(offset - maxLen));
}
// and then we try to expand the passage to the right with the remaining size
int remaining = Math.max(0, maxLen - (offset - innerStart));
if (offset + remaining < windowEnd) {
innerEnd = Math.min(windowEnd,
innerBreak.following(offset + remaining));
}
}
lastPrecedingOffset = offset - 1;
return innerStart;
}
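// Illustrative walk-through (assumed offsets): for a single 100-char sentence
// (windowStart=0, windowEnd=100), maxLen=20 and offset=50, innerStart moves to
// the last word boundary before offset-maxLen=30; the remaining budget is then
// exhausted, so innerEnd becomes the first word boundary after 50 and the
// passage is roughly maxLen chars instead of the whole 100-char sentence.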
/**
* Can be invoked only after a call to preceding(offset+1).
* See {@link FieldHighlighter} for usage.
*/
@Override
public int following(int offset) {
if (offset != lastPrecedingOffset || innerEnd == -1) {
throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
return innerEnd;
}
/**
* Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
* Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
*/
public static BreakIterator getSentence(Locale locale, int maxLen) {
final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
}
@Override
public int current() {
// Returns the last offset of the current split
return this.innerEnd;
}
@Override
public int first() {
throw new IllegalStateException("first() should not be called in this context");
}
@Override
public int next() {
throw new IllegalStateException("next() should not be called in this context");
}
@Override
public int last() {
throw new IllegalStateException("last() should not be called in this context");
}
@Override
public int next(int n) {
throw new IllegalStateException("next(n) should not be called in this context");
}
@Override
public int previous() {
throw new IllegalStateException("previous() should not be called in this context");
}
}
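For reference, a minimal sketch of the calling protocol this iterator expects, mirroring FieldHighlighter and the test added below; the sample text, maxLen value, and class name are assumptions, and the snippet is assumed to live in the same package as BoundedBreakIteratorScanner:

import java.text.BreakIterator;
import java.util.Locale;

public class BoundedScannerExample {
    public static void main(String[] args) {
        // Sentence passages capped at ~10 chars plus the trailing word,
        // re-split on word boundaries (values are illustrative).
        BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10);
        String text = "The quick brown fox in a long sentence with another quick brown fox.";
        bi.setText(text);
        int matchOffset = text.indexOf("quick");    // start offset of a hit term
        int start = bi.preceding(matchOffset + 1);  // always called first, with offset+1
        int end = bi.following(matchOffset);        // then the paired following() call
        System.out.println(text.substring(start, end)); // -> "The quick brown"
    }
}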


@@ -0,0 +1,79 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.util.Locale;
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
/**
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
* no highlights were found.
*/
class CustomFieldHighlighter extends FieldHighlighter {
private static final Passage[] EMPTY_PASSAGE = new Passage[0];
private final Locale breakIteratorLocale;
private final int noMatchSize;
private final String fieldValue;
CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
Locale breakIteratorLocale, BreakIterator breakIterator,
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
maxNoHighlightPassages, passageFormatter);
this.breakIteratorLocale = breakIteratorLocale;
this.noMatchSize = noMatchSize;
this.fieldValue = fieldValue;
}
@Override
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
if (noMatchSize > 0) {
int pos = 0;
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
pos ++;
}
if (pos < fieldValue.length()) {
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
if (end == -1) {
end = fieldValue.length();
}
if (noMatchSize+pos < end) {
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
bi.setText(fieldValue);
// Finds the next word boundary **after** noMatchSize.
end = bi.following(noMatchSize + pos);
if (end == BreakIterator.DONE) {
end = fieldValue.length();
}
}
Passage passage = new Passage();
passage.setScore(Float.NaN);
passage.setStartOffset(pos);
passage.setEndOffset(end);
return new Passage[]{passage};
}
}
return EMPTY_PASSAGE;
}
}
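A worked sketch of the trimming rule above, using the same sentence and noMatchSize value as the integration test further down:

import java.text.BreakIterator;
import java.util.Locale;

public class NoMatchSizeExample {
    public static void main(String[] args) {
        String value = "I am pretty long so some of me should get cut off.";
        int noMatchSize = 21;
        // Find the next word boundary **after** noMatchSize, exactly as
        // getSummaryPassagesNoHighlight does for a single-valued field.
        BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
        bi.setText(value);
        int end = bi.following(noMatchSize);
        if (end == BreakIterator.DONE) {
            end = value.length();
        }
        System.out.println(value.substring(0, end)); // -> "I am pretty long so some"
    }
}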


@@ -33,6 +33,8 @@ import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@ import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,12 +60,14 @@ import java.util.Map;
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
*/
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
public static final char MULTIVAL_SEP_CHAR = (char) 0;
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
private final Locale breakIteratorLocale;
private final int noMatchSize;
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -70,24 +75,26 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link CustomPassageFormatter}
* which generates snippets in forms of {@link Snippet} objects
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
* If null {@link Locale#ROOT} is used
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
* the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
* returned rather than empty snippets when no highlighting can be performed
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
boolean returnNonHighlightedSnippets) {
int noMatchSize) {
super(searcher, analyzer);
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
this.noMatchSize = noMatchSize;
}
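// Construction example (mirrors CustomUnifiedHighlighterTests): the field
// values are joined with MULTIVAL_SEP_CHAR into fieldValue, then:
//   new CustomUnifiedHighlighter(searcher, analyzer,
//       new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
//       Locale.ROOT, breakIterator, rawValue, noMatchSize);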
/**
@@ -111,17 +118,14 @@
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
//we only highlight one field, one document at a time
// we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}
@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator != null) {
return breakIterator;
}
return super.getBreakIterator(field);
}
@Override
protected PassageFormatter getFormatter(String field) {
@@ -129,11 +133,18 @@
}
@Override
protected int getMaxNoHighlightPassages(String field) {
if (returnNonHighlightedSnippets) {
return 1;
}
return 0;
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
FieldOffsetStrategy strategy =
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
}
@Override
@@ -146,7 +157,6 @@
return rewriteCustomQuery(query);
}
/**
* Translate custom queries in queries that are supported by the unified highlighter.
*/


@@ -52,13 +52,14 @@
public class FastVectorHighlighter implements Highlighter {
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getSentenceInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getWordInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER =
new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE =
Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope);
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
true, Setting.Property.NodeScope);
private static final String CACHE_KEY = "highlight-fsv";
private final Boolean termVectorMultiValue;
@@ -74,11 +75,12 @@
FieldMapper mapper = highlighterContext.mapper;
if (canHighlight(mapper) == false) {
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
+ "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName +
"] should be indexed with term vector with position offsets to be used with fast vector highlighter");
}
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
Encoder encoder = field.fieldOptions().encoder().equals("html") ?
HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
@@ -90,21 +92,21 @@
if (field.fieldOptions().requireFieldMatch()) {
if (cache.fieldMatchFieldQuery == null) {
/*
* we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
* readers...)
* we use top level reader to rewrite the query against all readers,
* with use caching it across hits (and across readers...)
*/
cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
true, field.fieldOptions().requireFieldMatch());
cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
}
fieldQuery = cache.fieldMatchFieldQuery;
} else {
if (cache.noFieldMatchFieldQuery == null) {
/*
* we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
* readers...)
* we use top level reader to rewrite the query against all readers,
* with use caching it across hits (and across readers...)
*/
cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
true, field.fieldOptions().requireFieldMatch());
cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
}
fieldQuery = cache.noFieldMatchFieldQuery;
}
@@ -142,7 +144,8 @@
fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
field.fieldOptions().postTags(), boundaryScanner);
} else {
fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
fragmentsBuilder =
new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
field.fieldOptions().postTags(), boundaryScanner);
}
}
@@ -153,8 +156,8 @@
entry.fragmentsBuilder = fragmentsBuilder;
if (cache.fvh == null) {
// parameters to FVH are not requires since:
// first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
// fragment builders are used explicitly
// first two booleans are not relevant since they are set on the CustomFieldQuery
// (phrase and fieldMatch) fragment builders are used explicitly
cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
}
CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
@@ -172,13 +175,14 @@
// we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
// Only send matched fields if they were requested to save time.
if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) {
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
} else {
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize,
numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
field.fieldOptions().postTags(), encoder);
} else {
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
}
if (fragments != null && fragments.length > 0) {
@@ -187,11 +191,13 @@
int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
if (noMatchSize > 0) {
// Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder
// Essentially we just request that a fragment is built from 0 to noMatchSize using
// the normal fragmentsBuilder
FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/);
fieldFragList.add(0, noMatchSize, Collections.<WeightedPhraseInfo>emptyList());
fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(),
mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(),
field.fieldOptions().postTags(), encoder);
if (fragments != null && fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
@@ -200,7 +206,8 @@
return null;
} catch (Exception e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
throw new FetchPhaseExecutionException(context,
"Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
}
@@ -212,8 +219,13 @@
private static BoundaryScanner getBoundaryScanner(Field field) {
final FieldOptions fieldOptions = field.fieldOptions();
final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale();
switch(fieldOptions.boundaryScannerType()) {
final Locale boundaryScannerLocale =
fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
Locale.ROOT;
final HighlightBuilder.BoundaryScannerType type =
fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
HighlightBuilder.BoundaryScannerType.CHARS;
switch(type) {
case SENTENCE:
if (boundaryScannerLocale != null) {
return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
@@ -224,12 +236,14 @@
return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
}
return DEFAULT_WORD_BOUNDARY_SCANNER;
default:
case CHARS:
if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|| fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
}
return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
default:
throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
}
}


@@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
.boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();


@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
@@ -34,12 +35,15 @@ import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
@@ -93,19 +97,22 @@
// we use a control char to separate values, which is the only char that the custom break iterator
// breaks the text on, so we don't lose the distinction between the different values of a field and we
// get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator =
new org.apache.lucene.search.postingshighlight
.CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
highlighter =
new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue,
field.fieldOptions().noMatchSize());
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
BreakIterator bi = getBreakIterator(field);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0);
mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi,
fieldValue, field.fieldOptions().noMatchSize());
numberOfFragments = field.fieldOptions().numberOfFragments();
}
if (field.fieldOptions().requireFieldMatch()) {
@@ -144,11 +151,34 @@
return null;
}
static class HighlighterEntry {
private BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions();
final Locale locale =
fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
Locale.ROOT;
final HighlightBuilder.BoundaryScannerType type =
fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
HighlightBuilder.BoundaryScannerType.SENTENCE;
int maxLen = fieldOptions.fragmentCharSize();
switch (type) {
case SENTENCE:
if (maxLen > 0) {
return BoundedBreakIteratorScanner.getSentence(locale, maxLen);
}
return BreakIterator.getSentenceInstance(locale);
case WORD:
// ignore maxLen
return BreakIterator.getWordInstance(locale);
default:
throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
}
}
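// Illustrative mapping: boundary_scanner=sentence with a positive
// fragment_size yields BoundedBreakIteratorScanner.getSentence(locale, maxLen),
// a fragment_size of 0 falls back to the plain sentence instance, and
// boundary_scanner=word always ignores the fragment size.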
private static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
}
static class MapperHighlighterEntry {
private static class MapperHighlighterEntry {
final CustomPassageFormatter passageFormatter;
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {


@@ -0,0 +1,138 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.elasticsearch.test.ESTestCase;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
public class BoundedBreakIteratorScannerTests extends ESTestCase {
private static final String[] WORD_BOUNDARIES =
new String[] { " ", " ", "\t", "#", "\n" };
private static final String[] SENTENCE_BOUNDARIES =
new String[] { "! ", "? ", ". ", ".\n", ".\n\n" };
private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) {
// Generate a random set of unique terms with ASCII characters
int maxSize = randomIntBetween(5, 100);
String[] vocabulary = new String[maxSize];
for (int i = 0; i < maxSize; i++) {
if (rarely()) {
vocabulary[i] = randomAsciiOfLengthBetween(50, 200);
} else {
vocabulary[i] = randomAsciiOfLengthBetween(1, 30);
}
}
// Generate a random text made of random terms separated with word-boundaries
// and sentence-boundaries.
StringBuilder text = new StringBuilder();
List<Integer> offsetList = new ArrayList<> ();
List<Integer> sizeList = new ArrayList<> ();
// the number of sentences to generate
int numSentences = randomIntBetween(10, 100);
int maxTermLen = 0;
for (int i = 0; i < numSentences; i++) {
// the number of terms in the sentence
int numTerms = randomIntBetween(5, 10);
for (int j = 0; j < numTerms; j++) {
int termId = randomIntBetween(0, vocabulary.length - 1);
String term = vocabulary[termId].toLowerCase(Locale.ROOT);
if (j == 0) {
// capitalize the first letter of the first term in the sentence
term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1);
} else {
String sep = randomFrom(WORD_BOUNDARIES);
text.append(sep);
}
maxTermLen = Math.max(term.length(), maxTermLen);
offsetList.add(text.length());
sizeList.add(term.length());
text.append(term);
}
String boundary = randomFrom(SENTENCE_BOUNDARIES);
text.append(boundary);
}
int[] sizes = sizeList.stream().mapToInt(i->i).toArray();
int[] offsets = offsetList.stream().mapToInt(i->i).toArray();
bi.setText(text.toString());
int currentPos = randomIntBetween(0, 20);
int lastEnd = -1;
int maxPassageLen = maxLen+(maxTermLen*2);
while (currentPos < offsets.length) {
// find the passage that contains the current term
int nextOffset = offsets[currentPos];
int start = bi.preceding(nextOffset+1);
int end = bi.following(nextOffset);
// check that the passage is valid
assertThat(start, greaterThanOrEqualTo(lastEnd));
assertThat(end, greaterThan(start));
assertThat(start, lessThanOrEqualTo(nextOffset));
assertThat(end, greaterThanOrEqualTo(nextOffset));
int passageLen = end-start;
assertThat(passageLen, lessThanOrEqualTo(maxPassageLen));
// checks that the start and end of the passage are on word boundaries.
int startPos = Arrays.binarySearch(offsets, start);
int endPos = Arrays.binarySearch(offsets, end);
if (startPos < 0) {
int lastWordEnd =
offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2];
assertThat(start, greaterThanOrEqualTo(lastWordEnd));
}
if (endPos < 0) {
if (Math.abs(endPos)-2 < offsets.length) {
int lastWordEnd =
offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2];
assertThat(end, greaterThanOrEqualTo(lastWordEnd));
}
// advance the position to the end of the current passage
currentPos = (Math.abs(endPos) - 1);
} else {
// advance the position to the end of the current passage
currentPos = endPos;
}
// randomly advance to the next term to highlight
currentPos += randomIntBetween(0, 20);
lastEnd = end;
}
}
public void testBoundedSentence() {
for (int i = 0; i < 20; i++) {
int maxLen = randomIntBetween(10, 500);
testRandomAsciiTextCase(
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen),
maxLen
);
}
}
}


@@ -20,20 +20,22 @@
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
@@ -41,219 +43,167 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Locale;
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
import static org.hamcrest.CoreMatchers.equalTo;
public class CustomUnifiedHighlighterTests extends ESTestCase {
public void testCustomUnifiedHighlighter() throws Exception {
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query,
Locale locale, BreakIterator breakIterator,
int noMatchSize, String[] expectedPassages) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
offsetsType.setStoreTermVectorOffsets(true);
offsetsType.setStoreTermVectorPositions(true);
offsetsType.setStoreTermVectors(true);
//good position but only one match
final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter.";
Field body = new Field("body", "", offsetsType);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.freeze();
Document doc = new Document();
doc.add(body);
body.setStringValue(firstValue);
//two matches, not the best snippet due to its length though
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " +
"that gets scored lower.";
Field body2 = new Field("body", "", offsetsType);
doc.add(body2);
body2.setStringValue(secondValue);
//two matches and short, will be scored highest
final String thirdValue = "This is highlighting the third short highlighting value.";
Field body3 = new Field("body", "", offsetsType);
doc.add(body3);
body3.setStringValue(thirdValue);
//one match, same as first but at the end, will be scored lower due to its position
final String fourthValue = "Just a test4 highlighting from unified highlighter.";
Field body4 = new Field("body", "", offsetsType);
doc.add(body4);
body4.setStringValue(fourthValue);
for (String input : inputs) {
Field field = new Field(fieldName, "", ft);
field.setStringValue(input);
doc.add(field);
}
iw.addDocument(doc);
IndexReader ir = iw.getReader();
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
String firstHlValue = "Just a test1 <b>highlighting</b> from unified highlighter.";
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
" longer text that gets scored lower.";
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
String fourthHlValue = "Just a test4 <b>highlighting</b> from unified highlighter.";
IndexSearcher searcher = newSearcher(ir);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue +
HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(),
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), null, fieldValue, true);
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(4));
assertThat(snippets[0].getText(), equalTo(firstHlValue));
assertThat(snippets[1].getText(), equalTo(secondHlValue));
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
ir.close();
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue,
noMatchSize);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets =
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
assertEquals(snippets.length, expectedPassages.length);
for (int i = 0; i < snippets.length; i++) {
assertEquals(snippets[i].getText(), expectedPassages[i]);
}
reader.close();
dir.close();
}
public void testSimple() throws Exception {
final String[] inputs = {
"This is a test. Just a test1 highlighting from unified highlighter.",
"This is the second highlighting value to perform highlighting on a longer text that gets scored lower.",
"This is highlighting the third short highlighting value.",
"Just a test4 highlighting from unified highlighter."
};
String[] expectedPassages = {
"Just a test1 <b>highlighting</b> from unified highlighter.",
"This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
" longer text that gets scored lower.",
"This is <b>highlighting</b> the third short <b>highlighting</b> value.",
"Just a test4 <b>highlighting</b> from unified highlighter."
};
Query query = new TermQuery(new Term("text", "highlighting"));
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages);
}
public void testNoMatchSize() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
offsetsType.setStoreTermVectorOffsets(true);
offsetsType.setStoreTermVectorPositions(true);
offsetsType.setStoreTermVectors(true);
Field body = new Field("body", "", offsetsType);
Field none = new Field("none", "", offsetsType);
Document doc = new Document();
doc.add(body);
doc.add(none);
String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore.";
body.setStringValue(firstValue);
none.setStringValue(firstValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
Query query = new TermQuery(new Term("none", "highlighting"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter,
null, firstValue, false);
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(0));
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true);
snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("This is a test."));
ir.close();
dir.close();
}
private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field textField = new Field(field, "", ft);
Document doc = new Document();
doc.add(textField);
textField.setStringValue(value);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
return ir;
final String[] inputs = {
"This is a test. Just a test highlighting from unified. Feel free to ignore."
};
Query query = new TermQuery(new Term("body", "highlighting"));
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs);
}
public void testMultiPhrasePrefixQuery() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
Directory dir = newDirectory();
String value = "The quick brown fox.";
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
final String[] inputs = {
"The quick brown fox."
};
final String[] outputs = {
"The <b>quick</b> <b>brown</b> <b>fox</b>."
};
MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery();
query.add(new Term("text", "quick"));
query.add(new Term("text", "brown"));
query.add(new Term("text", "fo"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
ir.close();
dir.close();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
}
public void testAllTermQuery() throws IOException {
Directory dir = newDirectory();
String value = "The quick brown fox.";
Analyzer analyzer = new StandardAnalyzer();
IndexReader ir = indexOneDoc(dir, "all", value, analyzer);
AllTermQuery query = new AllTermQuery(new Term("all", "fox"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("all", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The quick brown <b>fox</b>."));
ir.close();
dir.close();
public void testAllTermQuery() throws Exception {
final String[] inputs = {
"The quick brown fox."
};
final String[] outputs = {
"The quick brown <b>fox</b>."
};
AllTermQuery query = new AllTermQuery(new Term("text", "fox"));
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
}
public void testCommonTermsQuery() throws IOException {
Directory dir = newDirectory();
String value = "The quick brown fox.";
Analyzer analyzer = new StandardAnalyzer();
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
public void testCommonTermsQuery() throws Exception {
final String[] inputs = {
"The quick brown fox."
};
final String[] outputs = {
"The <b>quick</b> <b>brown</b> <b>fox</b>."
};
CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
query.add(new Term("text", "quick"));
query.add(new Term("text", "brown"));
query.add(new Term("text", "fox"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
ir.close();
dir.close();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
}
public void testSentenceBoundedBreakIterator() throws Exception {
final String[] inputs = {
"The quick brown fox in a long sentence with another quick brown fox. " +
"Another sentence with brown fox."
};
final String[] outputs = {
"The <b>quick</b> <b>brown</b>",
"<b>fox</b> in a long",
"with another <b>quick</b>",
"<b>brown</b> <b>fox</b>.",
"sentence with <b>brown</b>",
"<b>fox</b>.",
};
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD)
.build();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}
public void testRepeat() throws Exception {
final String[] inputs = {
"Fun fun fun fun fun fun fun fun fun fun"
};
final String[] outputs = {
"<b>Fun</b> <b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b>"
};
Query query = new TermQuery(new Term("text", "fun"));
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
query = new PhraseQuery.Builder()
.add(new Term("text", "fun"))
.add(new Term("text", "fun"))
.build();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}
}


@@ -751,52 +751,69 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
}
public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
public void testHighlighterWithSentenceBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
for (String type : new String[] {"unified", "fvh"}) {
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.field("field1", 21, 2)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
equalTo("A <xxx>sentence</xxx> with few words"),
equalTo("A <xxx>sentence</xxx> with few words. ")
));
assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
equalTo("Another <xxx>sentence</xxx> with"),
equalTo("Another <xxx>sentence</xxx> with even more words. ")
));
}
}
public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
for (String type : new String[] {"fvh", "unified"}) {
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.field("field1", 21, 2)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
equalTo("A <xxx>sentence</xxx> with few words"),
equalTo("A <xxx>sentence</xxx> with few words. ")
));
assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
equalTo("Another <xxx>sentence</xxx> with"),
equalTo("Another <xxx>sentence</xxx> with even more words. ")
));
}
}
public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
public void testHighlighterWithWordBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
@@ -804,39 +821,48 @@
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
for (String type : new String[] {"unified", "fvh"}) {
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
equalTo("<xxx>some</xxx> quick and hairy brown"),
equalTo("<xxx>some</xxx>")
));
}
}
public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
for (String type : new String[] {"unified", "fvh"}) {
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
equalTo("<xxx>some</xxx> quick and hairy brown"),
equalTo("<xxx>some</xxx>")
));
}
}
/**
@@ -1841,16 +1867,16 @@
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
// Unified hl also works but the fragment is longer than the plain highlighter's because the boundary is the next word
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// We can also ask for a fragment longer than the input string and get the whole string
field.highlighterType("plain").noMatchSize(text.length() * 2);
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
@@ -1860,16 +1886,15 @@
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// We can also ask for a fragment exactly the size of the input field and get the whole field
field.highlighterType("plain").noMatchSize(text.length());
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
@@ -1879,16 +1904,16 @@
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
// unified hl returns the first sentence as the noMatchSize does not cross the sentence boundary.
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// You can set noMatchSize globally in the highlighter as well
field.highlighterType("plain").noMatchSize(null);
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
@ -1898,12 +1923,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// We don't break if noMatchSize is less than zero though
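The fallback behaviour exercised in this hunk can be condensed into a minimal sketch (index and field names assumed from the test fixture): a `null` field-level `noMatchSize` defers to the value set on the enclosing `HighlightBuilder`.

// minimal sketch: per-field noMatchSize of null falls back to the global value (21)
HighlightBuilder.Field field = new HighlightBuilder.Field("text")
    .highlighterType("unified")
    .noMatchSize(null);
SearchResponse response = client().prepareSearch("test")
    .highlighter(new HighlightBuilder().field(field).noMatchSize(21))
    .get();
// per the assertions above, the unified highlighter then returns
// "I am pretty long so some"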
@ -1947,16 +1972,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
// And noMatchSize returns nothing when the first entry is empty string!
index("test", "type1", "2", "text", new String[] {"", text2});
refresh();
@ -1980,11 +2004,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");
// except for the unified highlighter which starts from the first string with actual content
field.highlighterType("unified");
response = client().prepareSearch("test")
.setQuery(idsQueryBuilder)
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");
assertHighlight(response, 0, "text", 0, 1, equalTo("I am short"));
// But if the field was actually empty then you should get no highlighting field
index("test", "type1", "3", "text", new String[] {});
@ -2031,7 +2056,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");
field.highlighterType("fvh");
field.highlighterType("unified");
response = client().prepareSearch("test")
.setQuery(idsQueryBuilder)
.highlighter(new HighlightBuilder().field(field)).get();
@ -2081,16 +2106,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
//if there's a match we only return the values with matches (whole value as number_of_fragments == 0)
MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth");
field.highlighterType("plain");

View File

@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a
* `highlight_query`
* `pre_tags` and `post_tags`
* `require_field_match`
* `boundary_scanner` (`sentence` (**default**) or `word`; see the sketch after this list)
* `max_fragment_length` (only for `sentence` scanner)
* `no_match_size`
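For reference, a minimal sketch of how these options map onto the Java `HighlightBuilder` API used by the tests in this commit (the field name and tags are illustrative):

// illustrative only: unified highlighter, sentence scanner, English locale
HighlightBuilder highlight = new HighlightBuilder()
    .field(new HighlightBuilder.Field("field1").noMatchSize(21))
    .highlighterType("unified")
    .preTags("<xxx>").postTags("</xxx>")
    .boundaryScannerType(BoundaryScannerType.SENTENCE)
    .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag());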
==== Force highlighter type
@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from.
In the case where there is no matching fragment to highlight, the default is
to not return anything. Instead, we can return a snippet of text from the
beginning of the field by setting `no_match_size` (default `0`) to the length
of the text that you want returned. The actual length may be shorter than
of the text that you want returned. The actual length may be shorter or longer than
specified as it tries to break on a word boundary. When using the postings
highlighter it is not possible to control the actual size of the snippet,
therefore the first sentence gets returned whenever `no_match_size` is
@ -504,21 +507,26 @@ GET /_search
[[boundary-scanners]]
==== Boundary Scanners
When highlighting a field using the fast vector highlighter, you can specify
how to break the highlighted fragments using `boundary_scanner`, which accepts
When highlighting a field using the unified highlighter or the fast vector highlighter,
you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts
the following values:
* `chars` (default): allows to configure which characters (`boundary_chars`)
* `chars` (default mode for the FVH): allows configuring which characters (`boundary_chars`)
constitute a boundary for highlighting. It's a single string with each boundary
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
the `boundary_max_scan` to control how far to look for boundary characters
(defaults to `20`).
(defaults to `20`). Works only with the Fast Vector Highlighter.
* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _word_ or _sentence_ boundary.
* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _sentence_ or _word_ boundary.
You can further specify `boundary_scanner_locale` to control which Locale is used
to search the text for these boundaries.
[NOTE]
When used with the `unified` highlighter, the `sentence` scanner splits sentences
bigger than `fragment_size` at the first word boundary after `fragment_size` is reached.
You can set `fragment_size` to 0 to never split any sentence.
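To make the NOTE concrete, here is a self-contained sketch, not from this commit, of the same splitting rule written against plain `java.text.BreakIterator` (the sample text and the 40-character limit are made up):

import java.text.BreakIterator;
import java.util.Locale;

public class BoundedSplitDemo {
    public static void main(String[] args) {
        String text = "This first sentence is far longer than the configured"
                + " fragment length and therefore gets split. Short one.";
        int fragmentLength = 40; // stand-in for the `fragment_size` setting

        BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ENGLISH);
        BreakIterator words = BreakIterator.getWordInstance(Locale.ENGLISH);
        sentences.setText(text);
        words.setText(text);

        int start = sentences.first();
        for (int end = sentences.next(); end != BreakIterator.DONE;
                start = end, end = sentences.next()) {
            int fragStart = start;
            // while the remaining sentence exceeds the limit, cut it at the
            // first word boundary *after* fragmentLength characters
            while (end - fragStart > fragmentLength) {
                int split = words.following(fragStart + fragmentLength);
                if (split == BreakIterator.DONE || split >= end) {
                    break;
                }
                System.out.println("[" + text.substring(fragStart, split).trim() + "]");
                fragStart = split;
            }
            System.out.println("[" + text.substring(fragStart, end).trim() + "]");
        }
    }
}

Each bracketed fragment ends at the first word boundary after the 40-character limit, while the short trailing sentence is printed whole, matching the behaviour the NOTE describes.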
[[matched-fields]]
==== Matched Fields
The Fast Vector Highlighter can combine matches on multiple fields to