Add support for fragment_length in the unified highlighter (#23431)
* Add support for fragment_length in the unified highlighter

This commit introduces a new break iterator (a `BoundedBreakIterator`) designed for the unified highlighter, able to limit the size of fragments produced by generic break iterators like `sentence`. The `unified` highlighter now supports `boundary_scanner`, which can be set to `words` or `sentence`. The `sentence` mode uses the bounded break iterator to limit the size of each sentence to `fragment_length`. When a sentence bigger than `fragment_length` is produced, this mode breaks it at the next word boundary **after** `fragment_length` is reached.
This commit is contained in:
parent c462d7d486
commit b8c352fc3f
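To make the new behavior concrete, here is a minimal, hypothetical sketch (the demo class and sample text are illustrative, not part of the commit) that drives the new BoundedBreakIteratorScanner the same way the FieldHighlighter does, with preceding(offset + 1) followed by following(offset):

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;

public class FragmentLengthDemo {
    public static void main(String[] args) {
        // One long sentence: a plain sentence BreakIterator would return it whole.
        String text = "The quick brown fox in a long sentence with another quick brown fox. "
            + "Another sentence with brown fox.";
        // Bound fragments to roughly 10 chars, extended to the next word boundary.
        BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10);
        bi.setText(text);
        // The UnifiedHighlighter's protocol: preceding(offset + 1), then following(offset).
        int offset = text.indexOf("quick");
        int start = bi.preceding(offset + 1);
        int end = bi.following(offset);
        // Prints the bounded fragment "The quick brown" rather than the whole sentence.
        System.out.println(text.substring(start, end));
    }
}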
BoundedBreakIteratorScanner.java (new file)
@ -0,0 +1,171 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
 * A custom break iterator that scans text to find break-delimited passages bounded by
 * a provided maximum length. This class delegates the boundary search to a first-level
 * break iterator. When this break iterator finds a passage greater than the maximum length,
 * a secondary break iterator is used to re-split the passage at the first boundary after
 * the maximum length.
 * This is useful to split passages created by {@link BreakIterator}s like `sentence` that
 * can create big outliers on semi-structured text.
 *
 * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
 **/
public class BoundedBreakIteratorScanner extends BreakIterator {
    private final BreakIterator mainBreak;
    private final BreakIterator innerBreak;
    private final int maxLen;

    private int lastPrecedingOffset = -1;
    private int windowStart = -1;
    private int windowEnd = -1;
    private int innerStart = -1;
    private int innerEnd = 0;

    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
                                        BreakIterator innerBreak,
                                        int maxLen) {
        this.mainBreak = mainBreak;
        this.innerBreak = innerBreak;
        this.maxLen = maxLen;
    }

    @Override
    public CharacterIterator getText() {
        return mainBreak.getText();
    }

    @Override
    public void setText(CharacterIterator newText) {
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    @Override
    public void setText(String newText) {
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    private void reset() {
        lastPrecedingOffset = -1;
        windowStart = -1;
        windowEnd = -1;
        innerStart = -1;
        innerEnd = 0;
    }

    /**
     * Must be called with increasing offset. See {@link FieldHighlighter} for usage.
     */
    @Override
    public int preceding(int offset) {
        if (offset < lastPrecedingOffset) {
            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
                "usage doesn't look like UnifiedHighlighter");
        }
        if (offset > windowStart && offset < windowEnd) {
            innerStart = innerEnd;
            innerEnd = windowEnd;
        } else {
            windowStart = innerStart = mainBreak.preceding(offset);
            windowEnd = innerEnd = mainBreak.following(offset - 1);
        }

        if (innerEnd - innerStart > maxLen) {
            // the current split is too big,
            // so starting from the current term we try to find boundaries on the left first
            if (offset - maxLen > innerStart) {
                innerStart = Math.max(innerStart,
                    innerBreak.preceding(offset - maxLen));
            }
            // and then we try to expand the passage to the right with the remaining size
            int remaining = Math.max(0, maxLen - (offset - innerStart));
            if (offset + remaining < windowEnd) {
                innerEnd = Math.min(windowEnd,
                    innerBreak.following(offset + remaining));
            }
        }
        lastPrecedingOffset = offset - 1;
        return innerStart;
    }

    /**
     * Can be invoked only after a call to preceding(offset+1).
     * See {@link FieldHighlighter} for usage.
     */
    @Override
    public int following(int offset) {
        if (offset != lastPrecedingOffset || innerEnd == -1) {
            throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
                "usage doesn't look like UnifiedHighlighter");
        }
        return innerEnd;
    }

    /**
     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
     * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
     */
    public static BreakIterator getSentence(Locale locale, int maxLen) {
        final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
        final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
        return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
    }

    @Override
    public int current() {
        // Returns the last offset of the current split
        return this.innerEnd;
    }

    @Override
    public int first() {
        throw new IllegalStateException("first() should not be called in this context");
    }

    @Override
    public int next() {
        throw new IllegalStateException("next() should not be called in this context");
    }

    @Override
    public int last() {
        throw new IllegalStateException("last() should not be called in this context");
    }

    @Override
    public int next(int n) {
        throw new IllegalStateException("next(n) should not be called in this context");
    }

    @Override
    public int previous() {
        throw new IllegalStateException("previous() should not be called in this context");
    }
}
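A second runnable sketch (values assumed for illustration) of the re-split path in preceding() above: when the match sits past maxLen inside an oversized sentence, the passage is re-anchored at word boundaries around the match instead of starting at the sentence start:

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;

public class ResplitDemo {
    public static void main(String[] args) {
        // A single 40-char sentence, far above the 20-char bound used below.
        String text = "aaaa bbbb cccc dddd eeee ffff gggg hhhh.";
        BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20);
        bi.setText(text);
        int offset = text.indexOf("ffff");      // offset 25, past the 20-char cap
        int start = bi.preceding(offset + 1);   // word boundary near (offset - maxLen)
        int end = bi.following(offset);         // word boundary after the match
        // Prints "bbbb cccc dddd eeee ffff": roughly maxLen chars around the match.
        System.out.println(text.substring(start, end));
    }
}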
CustomFieldHighlighter.java (new file)
@ -0,0 +1,79 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.util.Locale;

import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

/**
 * Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
 * no highlights were found.
 */
class CustomFieldHighlighter extends FieldHighlighter {
    private static final Passage[] EMPTY_PASSAGE = new Passage[0];

    private final Locale breakIteratorLocale;
    private final int noMatchSize;
    private final String fieldValue;

    CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
                           Locale breakIteratorLocale, BreakIterator breakIterator,
                           PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
                           PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
        super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
            maxNoHighlightPassages, passageFormatter);
        this.breakIteratorLocale = breakIteratorLocale;
        this.noMatchSize = noMatchSize;
        this.fieldValue = fieldValue;
    }

    @Override
    protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
        if (noMatchSize > 0) {
            int pos = 0;
            while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
                pos++;
            }
            if (pos < fieldValue.length()) {
                int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
                if (end == -1) {
                    end = fieldValue.length();
                }
                if (noMatchSize + pos < end) {
                    BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
                    bi.setText(fieldValue);
                    // Finds the next word boundary **after** noMatchSize.
                    end = bi.following(noMatchSize + pos);
                    if (end == BreakIterator.DONE) {
                        end = fieldValue.length();
                    }
                }
                Passage passage = new Passage();
                passage.setScore(Float.NaN);
                passage.setStartOffset(pos);
                passage.setEndOffset(end);
                return new Passage[]{passage};
            }
        }
        return EMPTY_PASSAGE;
    }
}
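The truncation rule in getSummaryPassagesNoHighlight can be reproduced in isolation. This small sketch (the field value and size are made up for the example) shows how noMatchSize is stretched to the next word boundary rather than cutting a word in half:

import java.text.BreakIterator;
import java.util.Locale;

public class NoMatchSizeDemo {
    public static void main(String[] args) {
        String fieldValue = "This is a test. Just a test highlighting from unified.";
        int noMatchSize = 18; // requested prefix length, in characters
        BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
        bi.setText(fieldValue);
        // First word boundary strictly after noMatchSize, as in the class above.
        int end = bi.following(noMatchSize);
        if (end == BreakIterator.DONE) {
            end = fieldValue.length();
        }
        // Prints "This is a test. Just" (20 chars) instead of a mid-word cut at 18.
        System.out.println(fieldValue.substring(0, end));
    }
}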
CustomUnifiedHighlighter.java
@ -33,6 +33,8 @@ import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@ -47,6 +49,7 @@ import java.util.Collections;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;

 /**
  * Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@ -57,37 +60,41 @@ import java.util.Map;
  * Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
  */
 public class CustomUnifiedHighlighter extends UnifiedHighlighter {
+    public static final char MULTIVAL_SEP_CHAR = (char) 0;
     private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];

     private final String fieldValue;
     private final PassageFormatter passageFormatter;
     private final BreakIterator breakIterator;
-    private final boolean returnNonHighlightedSnippets;
+    private final Locale breakIteratorLocale;
+    private final int noMatchSize;

     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
      *
      * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
      * @param passageFormatter our own {@link CustomPassageFormatter}
-     *                    which generates snippets in forms of {@link Snippet} objects
+     *                         which generates snippets in forms of {@link Snippet} objects
+     * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
+     *                            If null {@link Locale#ROOT} is used
      * @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
-     *                      If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
-     * @param fieldValue the original field values as constructor argument, loaded from the _source field or
-     *                   the relevant stored field.
-     * @param returnNonHighlightedSnippets whether non highlighted snippets should be
-     *                                     returned rather than empty snippets when no highlighting can be performed
+     *                      If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
+     * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
+     * @param noMatchSize The size of the text that should be returned when no highlighting can be performed
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
                                     PassageFormatter passageFormatter,
+                                    @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    boolean returnNonHighlightedSnippets) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.breakIterator = breakIterator;
+        this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
-        this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
+        this.noMatchSize = noMatchSize;
     }

     /**
@ -111,16 +118,13 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
                                                    int cacheCharsThreshold) throws IOException {
-        //we only highlight one field, one document at a time
+        // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }

     @Override
     protected BreakIterator getBreakIterator(String field) {
-        if (breakIterator != null) {
-            return breakIterator;
-        }
-        return super.getBreakIterator(field);
+        return breakIterator;
     }

     @Override
@ -129,11 +133,18 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     }

     @Override
-    protected int getMaxNoHighlightPassages(String field) {
-        if (returnNonHighlightedSnippets) {
-            return 1;
-        }
-        return 0;
+    protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
+        BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
+        Set<HighlightFlag> highlightFlags = getFlags(field);
+        PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+        CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+        OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
+        BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
+            UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+        FieldOffsetStrategy strategy =
+            getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
+        return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
+            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
     }

     @Override
@ -146,7 +157,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
         return rewriteCustomQuery(query);
     }

-
     /**
      * Translate custom queries in queries that are supported by the unified highlighter.
      */
FastVectorHighlighter.java
@ -52,13 +52,14 @@ import java.util.Map;
 public class FastVectorHighlighter implements Highlighter {

     private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
-    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-            BreakIterator.getSentenceInstance(Locale.ROOT));
-    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-            BreakIterator.getWordInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));

-    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
-            true, Setting.Property.NodeScope);
+    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE =
+        Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope);
+
     private static final String CACHE_KEY = "highlight-fsv";
     private final Boolean termVectorMultiValue;

@ -74,11 +75,12 @@ public class FastVectorHighlighter implements Highlighter {
         FieldMapper mapper = highlighterContext.mapper;

         if (canHighlight(mapper) == false) {
-            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
-                    + "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
+            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName +
+                "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
         }

-        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
+        Encoder encoder = field.fieldOptions().encoder().equals("html") ?
+            HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;

         if (!hitContext.cache().containsKey(CACHE_KEY)) {
             hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
@ -90,21 +92,21 @@ public class FastVectorHighlighter implements Highlighter {
         if (field.fieldOptions().requireFieldMatch()) {
             if (cache.fieldMatchFieldQuery == null) {
                 /*
-                 * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                 * readers...)
+                 * we use top level reader to rewrite the query against all readers,
+                 * with use caching it across hits (and across readers...)
                  */
-                cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                    hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
             }
             fieldQuery = cache.fieldMatchFieldQuery;
         } else {
             if (cache.noFieldMatchFieldQuery == null) {
                 /*
-                 * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                 * readers...)
+                 * we use top level reader to rewrite the query against all readers,
+                 * with use caching it across hits (and across readers...)
                  */
-                cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                    hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
             }
             fieldQuery = cache.noFieldMatchFieldQuery;
         }
@ -128,7 +130,7 @@ public class FastVectorHighlighter implements Highlighter {
             }
         } else {
             fragListBuilder = field.fieldOptions().fragmentOffset() == -1 ?
-                    new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
+                new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
             if (field.fieldOptions().scoreOrdered()) {
                 if (!forceSource && mapper.fieldType().stored()) {
                     fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.fieldOptions().preTags(),
@ -142,7 +144,8 @@ public class FastVectorHighlighter implements Highlighter {
                     fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
                         field.fieldOptions().postTags(), boundaryScanner);
                 } else {
-                    fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
+                    fragmentsBuilder =
+                        new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
                             field.fieldOptions().postTags(), boundaryScanner);
                 }
             }
@ -153,8 +156,8 @@ public class FastVectorHighlighter implements Highlighter {
             entry.fragmentsBuilder = fragmentsBuilder;
             if (cache.fvh == null) {
                 // parameters to FVH are not requires since:
-                // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
-                // fragment builders are used explicitly
+                // first two booleans are not relevant since they are set on the CustomFieldQuery
+                // (phrase and fieldMatch) fragment builders are used explicitly
                 cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
             }
             CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
@ -172,13 +175,14 @@ public class FastVectorHighlighter implements Highlighter {
             // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
             // Only send matched fields if they were requested to save time.
             if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) {
-                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
-                        field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
-                        entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
+                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
+                    mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize,
+                    numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
+                    field.fieldOptions().postTags(), encoder);
             } else {
-                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
-                        fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
-                        field.fieldOptions().postTags(), encoder);
+                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
+                    mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
+                    entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
             }

             if (fragments != null && fragments.length > 0) {
@ -187,11 +191,13 @@ public class FastVectorHighlighter implements Highlighter {

             int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
             if (noMatchSize > 0) {
-                // Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder
+                // Essentially we just request that a fragment is built from 0 to noMatchSize using
+                // the normal fragmentsBuilder
                 FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/);
                 fieldFragList.add(0, noMatchSize, Collections.<WeightedPhraseInfo>emptyList());
-                fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
-                        fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
+                fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(),
+                    mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(),
+                    field.fieldOptions().postTags(), encoder);
                 if (fragments != null && fragments.length > 0) {
                     return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
                 }
@ -200,7 +206,8 @@ public class FastVectorHighlighter implements Highlighter {
             return null;

         } catch (Exception e) {
-            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
+            throw new FetchPhaseExecutionException(context,
+                "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
         }
     }

@ -212,24 +219,31 @@ public class FastVectorHighlighter implements Highlighter {

     private static BoundaryScanner getBoundaryScanner(Field field) {
         final FieldOptions fieldOptions = field.fieldOptions();
-        final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale();
-        switch(fieldOptions.boundaryScannerType()) {
-        case SENTENCE:
-            if (boundaryScannerLocale != null) {
-                return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
-            }
-            return DEFAULT_SENTENCE_BOUNDARY_SCANNER;
-        case WORD:
-            if (boundaryScannerLocale != null) {
-                return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
-            }
-            return DEFAULT_WORD_BOUNDARY_SCANNER;
-        default:
-            if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
+        final Locale boundaryScannerLocale =
+            fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
+                Locale.ROOT;
+        final HighlightBuilder.BoundaryScannerType type =
+            fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
+                HighlightBuilder.BoundaryScannerType.CHARS;
+        switch(type) {
+            case SENTENCE:
+                if (boundaryScannerLocale != null) {
+                    return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
+                }
+                return DEFAULT_SENTENCE_BOUNDARY_SCANNER;
+            case WORD:
+                if (boundaryScannerLocale != null) {
+                    return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
+                }
+                return DEFAULT_WORD_BOUNDARY_SCANNER;
+            case CHARS:
+                if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
                     || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
-            return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
-        }
-        return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
+                    return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
+                }
+                return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
+            default:
+                throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
         }
     }

HighlightBuilder.java
@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
             .preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
             .highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
             .forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
-            .numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
+            .numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
             .boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
             .boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();

UnifiedHighlighter.java (Elasticsearch highlighter)
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.highlight.Encoder;
 import org.apache.lucene.search.highlight.Snippet;
+import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
 import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
 import org.apache.lucene.util.BytesRef;
@ -34,12 +35,15 @@ import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.internal.SearchContext;

 import java.io.IOException;
+import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.stream.Collectors;

+import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
 import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;

@ -93,19 +97,22 @@ public class UnifiedHighlighter implements Highlighter {
                 // we use a control char to separate values, which is the only char that the custom break iterator
                 // breaks the text on, so we don't lose the distinction between the different values of a field and we
                 // get back a snippet per value
-                String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
+                String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
                 org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator =
                     new org.apache.lucene.search.postingshighlight
-                        .CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
+                        .CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                 highlighter =
                     new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
-                        breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
+                        field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue,
+                        field.fieldOptions().noMatchSize());
                 numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
             } else {
                 //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
-                String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
+                String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
+                BreakIterator bi = getBreakIterator(field);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
-                    mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0);
+                    mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi,
+                    fieldValue, field.fieldOptions().noMatchSize());
                 numberOfFragments = field.fieldOptions().numberOfFragments();
             }
             if (field.fieldOptions().requireFieldMatch()) {
@ -144,11 +151,34 @@ public class UnifiedHighlighter implements Highlighter {
         return null;
     }

-    static class HighlighterEntry {
+    private BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
+        final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions();
+        final Locale locale =
+            fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
+                Locale.ROOT;
+        final HighlightBuilder.BoundaryScannerType type =
+            fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
+                HighlightBuilder.BoundaryScannerType.SENTENCE;
+        int maxLen = fieldOptions.fragmentCharSize();
+        switch (type) {
+            case SENTENCE:
+                if (maxLen > 0) {
+                    return BoundedBreakIteratorScanner.getSentence(locale, maxLen);
+                }
+                return BreakIterator.getSentenceInstance(locale);
+            case WORD:
+                // ignore maxLen
+                return BreakIterator.getWordInstance(locale);
+            default:
+                throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
+        }
+    }
+
+    private static class HighlighterEntry {
         Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
     }

-    static class MapperHighlighterEntry {
+    private static class MapperHighlighterEntry {
         final CustomPassageFormatter passageFormatter;

         private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
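Putting the pieces together, a request like the following (a hedged sketch reusing the same static helpers as the integration test at the end of this diff, not a new API) now gets sentence passages capped near the requested fragment size when the highlighter type is `unified`:

// Assumes the usual test-helper static imports: searchSource(), termQuery(), highlight().
SearchSourceBuilder source = searchSource()
    .query(termQuery("field1", "sentence"))
    .highlighter(highlight()
        .field("field1", 21, 2)   // fragment_length 21, two fragments
        .highlighterType("unified")
        .boundaryScannerType(BoundaryScannerType.SENTENCE));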
BoundedBreakIteratorScannerTests.java (new file)
@ -0,0 +1,138 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.search.uhighlight;

import org.elasticsearch.test.ESTestCase;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.lessThanOrEqualTo;

public class BoundedBreakIteratorScannerTests extends ESTestCase {
    private static final String[] WORD_BOUNDARIES =
        new String[] { " ", " ", "\t", "#", "\n" };
    private static final String[] SENTENCE_BOUNDARIES =
        new String[] { "! ", "? ", ". ", ".\n", ".\n\n" };

    private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) {
        // Generate a random set of unique terms with ascii characters
        int maxSize = randomIntBetween(5, 100);
        String[] vocabulary = new String[maxSize];
        for (int i = 0; i < maxSize; i++) {
            if (rarely()) {
                vocabulary[i] = randomAsciiOfLengthBetween(50, 200);
            } else {
                vocabulary[i] = randomAsciiOfLengthBetween(1, 30);
            }
        }

        // Generate a random text made of random terms separated with word-boundaries
        // and sentence-boundaries.
        StringBuilder text = new StringBuilder();
        List<Integer> offsetList = new ArrayList<>();
        List<Integer> sizeList = new ArrayList<>();
        // the number of sentences to generate
        int numSentences = randomIntBetween(10, 100);
        int maxTermLen = 0;
        for (int i = 0; i < numSentences; i++) {
            // the number of terms in the sentence
            int numTerms = randomIntBetween(5, 10);
            for (int j = 0; j < numTerms; j++) {
                int termId = randomIntBetween(0, vocabulary.length - 1);
                String term = vocabulary[termId].toLowerCase(Locale.ROOT);
                if (j == 0) {
                    // capitalize the first letter of the first term in the sentence
                    term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1);
                } else {
                    String sep = randomFrom(WORD_BOUNDARIES);
                    text.append(sep);
                }
                maxTermLen = Math.max(term.length(), maxTermLen);
                offsetList.add(text.length());
                sizeList.add(term.length());
                text.append(term);
            }
            String boundary = randomFrom(SENTENCE_BOUNDARIES);
            text.append(boundary);
        }

        int[] sizes = sizeList.stream().mapToInt(i -> i).toArray();
        int[] offsets = offsetList.stream().mapToInt(i -> i).toArray();

        bi.setText(text.toString());
        int currentPos = randomIntBetween(0, 20);
        int lastEnd = -1;
        int maxPassageLen = maxLen + (maxTermLen * 2);
        while (currentPos < offsets.length) {
            // find the passage that contains the current term
            int nextOffset = offsets[currentPos];
            int start = bi.preceding(nextOffset + 1);
            int end = bi.following(nextOffset);

            // check that the passage is valid
            assertThat(start, greaterThanOrEqualTo(lastEnd));
            assertThat(end, greaterThan(start));
            assertThat(start, lessThanOrEqualTo(nextOffset));
            assertThat(end, greaterThanOrEqualTo(nextOffset));
            int passageLen = end - start;
            assertThat(passageLen, lessThanOrEqualTo(maxPassageLen));

            // checks that the start and end of the passage are on word boundaries.
            int startPos = Arrays.binarySearch(offsets, start);
            int endPos = Arrays.binarySearch(offsets, end);
            if (startPos < 0) {
                int lastWordEnd =
                    offsets[Math.abs(startPos) - 2] + sizes[Math.abs(startPos) - 2];
                assertThat(start, greaterThanOrEqualTo(lastWordEnd));
            }
            if (endPos < 0) {
                if (Math.abs(endPos) - 2 < offsets.length) {
                    int lastWordEnd =
                        offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2];
                    assertThat(end, greaterThanOrEqualTo(lastWordEnd));
                }
                // advance the position to the end of the current passage
                currentPos = (Math.abs(endPos) - 1);
            } else {
                // advance the position to the end of the current passage
                currentPos = endPos;
            }
            // randomly advance to the next term to highlight
            currentPos += randomIntBetween(0, 20);
            lastEnd = end;
        }
    }

    public void testBoundedSentence() {
        for (int i = 0; i < 20; i++) {
            int maxLen = randomIntBetween(10, 500);
            testRandomAsciiTextCase(
                BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen),
                maxLen
            );
        }
    }
}
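One reasoning step the test leaves implicit: a passage can exceed maxLen by at most one term on each side, since the left re-split may keep the word straddling (offset - maxLen) and the right expansion may cross the cap by one more word. This illustrative arithmetic (sample values, not part of the commit) spells out the assertion bound used above:

// left overshoot  <= maxTermLen (word straddling offset - maxLen)
// right overshoot <= maxTermLen (word boundary found after the cap)
int maxLen = 100, maxTermLen = 30;             // sample values
int maxPassageLen = maxLen + (maxTermLen * 2); // = 160, the assertion bound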
@ -20,20 +20,22 @@
|
|||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.CommonTermsQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
@ -41,219 +43,167 @@ import org.apache.lucene.search.TopDocs;
|
|||
import org.apache.lucene.search.highlight.DefaultEncoder;
|
||||
import org.apache.lucene.search.highlight.Snippet;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.lucene.all.AllTermQuery;
|
||||
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
||||
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.BreakIterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
|
||||
import static org.hamcrest.CoreMatchers.equalTo;
|
||||
|
||||
public class CustomUnifiedHighlighterTests extends ESTestCase {
|
||||
public void testCustomUnifiedHighlighter() throws Exception {
|
||||
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query,
|
||||
Locale locale, BreakIterator breakIterator,
|
||||
int noMatchSize, String[] expectedPassages) throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newTieredMergePolicy(random()));
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
offsetsType.setStoreTermVectorOffsets(true);
|
||||
offsetsType.setStoreTermVectorPositions(true);
|
||||
offsetsType.setStoreTermVectors(true);
|
||||
|
||||
//good position but only one match
|
||||
final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter.";
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
FieldType ft = new FieldType(TextField.TYPE_STORED);
|
||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
ft.freeze();
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
body.setStringValue(firstValue);
|
||||
|
||||
//two matches, not the best snippet due to its length though
|
||||
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " +
|
||||
"that gets scored lower.";
|
||||
Field body2 = new Field("body", "", offsetsType);
|
||||
doc.add(body2);
|
||||
body2.setStringValue(secondValue);
|
||||
|
||||
//two matches and short, will be scored highest
|
||||
final String thirdValue = "This is highlighting the third short highlighting value.";
|
||||
Field body3 = new Field("body", "", offsetsType);
|
||||
doc.add(body3);
|
||||
body3.setStringValue(thirdValue);
|
||||
|
||||
//one match, same as first but at the end, will be scored lower due to its position
|
||||
final String fourthValue = "Just a test4 highlighting from unified highlighter.";
|
||||
Field body4 = new Field("body", "", offsetsType);
|
||||
doc.add(body4);
|
||||
body4.setStringValue(fourthValue);
|
||||
|
||||
for (String input : inputs) {
|
||||
Field field = new Field(fieldName, "", ft);
|
||||
field.setStringValue(input);
|
||||
doc.add(field);
|
||||
}
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
DirectoryReader reader = iw.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
iw.close();
|
||||
|
||||
String firstHlValue = "Just a test1 <b>highlighting</b> from unified highlighter.";
|
||||
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
|
||||
" longer text that gets scored lower.";
|
||||
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
|
||||
String fourthHlValue = "Just a test4 <b>highlighting</b> from unified highlighter.";
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Query query = new TermQuery(new Term("body", "highlighting"));
|
||||
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
|
||||
assertThat(topDocs.totalHits, equalTo(1));
|
||||
|
||||
int docId = topDocs.scoreDocs[0].doc;
|
||||
|
||||
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue +
|
||||
HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
|
||||
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(),
|
||||
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), null, fieldValue, true);
|
||||
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
|
||||
|
||||
assertThat(snippets.length, equalTo(4));
|
||||
|
||||
assertThat(snippets[0].getText(), equalTo(firstHlValue));
|
||||
assertThat(snippets[1].getText(), equalTo(secondHlValue));
|
||||
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
|
||||
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
|
||||
ir.close();
|
||||
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue,
|
||||
noMatchSize);
|
||||
highlighter.setFieldMatcher((name) -> "text".equals(name));
|
||||
final Snippet[] snippets =
|
||||
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
|
||||
assertEquals(snippets.length, expectedPassages.length);
|
||||
for (int i = 0; i < snippets.length; i++) {
|
||||
assertEquals(snippets[i].getText(), expectedPassages[i]);
|
||||
}
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSimple() throws Exception {
|
||||
final String[] inputs = {
|
||||
"This is a test. Just a test1 highlighting from unified highlighter.",
|
||||
"This is the second highlighting value to perform highlighting on a longer text that gets scored lower.",
|
||||
"This is highlighting the third short highlighting value.",
|
||||
"Just a test4 highlighting from unified highlighter."
|
||||
};
|
||||
|
||||
String[] expectedPassages = {
|
||||
"Just a test1 <b>highlighting</b> from unified highlighter.",
|
||||
"This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
|
||||
" longer text that gets scored lower.",
|
||||
"This is <b>highlighting</b> the third short <b>highlighting</b> value.",
|
||||
"Just a test4 <b>highlighting</b> from unified highlighter."
|
||||
};
|
||||
Query query = new TermQuery(new Term("text", "highlighting"));
|
||||
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages);
|
||||
}
|
||||
|
||||
public void testNoMatchSize() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
offsetsType.setStoreTermVectorOffsets(true);
|
||||
offsetsType.setStoreTermVectorPositions(true);
|
||||
offsetsType.setStoreTermVectors(true);
|
||||
Field body = new Field("body", "", offsetsType);
|
||||
Field none = new Field("none", "", offsetsType);
|
||||
Document doc = new Document();
|
||||
doc.add(body);
|
||||
doc.add(none);
|
||||
|
||||
String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore.";
|
||||
body.setStringValue(firstValue);
|
||||
none.setStringValue(firstValue);
|
||||
iw.addDocument(doc);
|
||||
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
|
||||
Query query = new TermQuery(new Term("none", "highlighting"));
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertThat(topDocs.totalHits, equalTo(1));
|
||||
int docId = topDocs.scoreDocs[0].doc;
|
||||
|
||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter,
|
||||
null, firstValue, false);
|
||||
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
|
||||
assertThat(snippets.length, equalTo(0));
|
||||
|
||||
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true);
|
||||
snippets = highlighter.highlightField("body", query, docId, 5);
|
||||
assertThat(snippets.length, equalTo(1));
|
||||
assertThat(snippets[0].getText(), equalTo("This is a test."));
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
||||
private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException {
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
FieldType ft = new FieldType(TextField.TYPE_STORED);
|
||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||
Field textField = new Field(field, "", ft);
|
||||
Document doc = new Document();
|
||||
doc.add(textField);
|
||||
|
||||
textField.setStringValue(value);
|
||||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
return ir;
|
||||
final String[] inputs = {
|
||||
"This is a test. Just a test highlighting from unified. Feel free to ignore."
|
||||
};
|
||||
Query query = new TermQuery(new Term("body", "highlighting"));
|
||||
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs);
|
||||
}
|
||||
|
||||
public void testMultiPhrasePrefixQuery() throws Exception {
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
Directory dir = newDirectory();
|
||||
String value = "The quick brown fox.";
|
||||
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
|
||||
final String[] inputs = {
|
||||
"The quick brown fox."
|
||||
};
|
||||
final String[] outputs = {
|
||||
"The <b>quick</b> <b>brown</b> <b>fox</b>."
|
||||
};
|
||||
MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery();
|
||||
query.add(new Term("text", "quick"));
|
||||
query.add(new Term("text", "brown"));
|
||||
query.add(new Term("text", "fo"));
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertThat(topDocs.totalHits, equalTo(1));
|
||||
int docId = topDocs.scoreDocs[0].doc;
|
||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||
passageFormatter, null, value, false);
|
||||
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
|
||||
assertThat(snippets.length, equalTo(1));
|
||||
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
|
||||
ir.close();
|
||||
dir.close();
|
||||
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||
}
|
||||
|
||||
public void testAllTermQuery() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
String value = "The quick brown fox.";
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
IndexReader ir = indexOneDoc(dir, "all", value, analyzer);
|
||||
AllTermQuery query = new AllTermQuery(new Term("all", "fox"));
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertThat(topDocs.totalHits, equalTo(1));
|
||||
int docId = topDocs.scoreDocs[0].doc;
|
||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||
passageFormatter, null, value, false);
|
||||
Snippet[] snippets = highlighter.highlightField("all", query, docId, 5);
|
||||
assertThat(snippets.length, equalTo(1));
|
||||
assertThat(snippets[0].getText(), equalTo("The quick brown <b>fox</b>."));
|
||||
ir.close();
|
||||
dir.close();
|
||||
public void testAllTermQuery() throws Exception {
|
||||
final String[] inputs = {
|
||||
"The quick brown fox."
|
||||
};
|
||||
final String[] outputs = {
|
||||
"The quick brown <b>fox</b>."
|
||||
};
|
||||
AllTermQuery query = new AllTermQuery(new Term("text", "fox"));
|
||||
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||
}
|
||||
|
||||
public void testCommonTermsQuery() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
String value = "The quick brown fox.";
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
|
||||
public void testCommonTermsQuery() throws Exception {
|
||||
final String[] inputs = {
|
||||
"The quick brown fox."
|
||||
};
|
||||
final String[] outputs = {
|
||||
"The <b>quick</b> <b>brown</b> <b>fox</b>."
|
||||
};
|
||||
CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
|
||||
query.add(new Term("text", "quick"));
|
||||
query.add(new Term("text", "brown"));
|
||||
query.add(new Term("text", "fox"));
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
||||
assertThat(topDocs.totalHits, equalTo(1));
|
||||
int docId = topDocs.scoreDocs[0].doc;
|
||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||
passageFormatter, null, value, false);
|
||||
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
|
||||
assertThat(snippets.length, equalTo(1));
|
||||
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
|
||||
ir.close();
|
||||
dir.close();
|
||||
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||
}
|
||||
|
||||
public void testSentenceBoundedBreakIterator() throws Exception {
final String[] inputs = {
"The quick brown fox in a long sentence with another quick brown fox. " +
"Another sentence with brown fox."
};
final String[] outputs = {
"The <b>quick</b> <b>brown</b>",
"<b>fox</b> in a long",
"with another <b>quick</b>",
"<b>brown</b> <b>fox</b>.",
"sentence with <b>brown</b>",
"<b>fox</b>.",
};
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD)
.build();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}

public void testRepeat() throws Exception {
final String[] inputs = {
"Fun fun fun fun fun fun fun fun fun fun"
};
final String[] outputs = {
"<b>Fun</b> <b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b> <b>fun</b>",
"<b>fun</b> <b>fun</b>"
};
Query query = new TermQuery(new Term("text", "fun"));
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);

query = new PhraseQuery.Builder()
.add(new Term("text", "fun"))
.add(new Term("text", "fun"))
.build();
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}
}
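The expected fragments above follow directly from the two-level break logic: with a maximum length of 10, each sentence is re-split at the first word boundary after the tenth character. Here is a rough, self-contained sketch of that logic using plain java.text.BreakIterator. It mimics, but is not, the actual BoundedBreakIteratorScanner (which also has to cooperate with the UnifiedHighlighter's offset handling), and it prints every piece, whereas the highlighter only keeps pieces that contain matched terms.

    import java.text.BreakIterator;
    import java.util.Locale;

    public class BoundedSentenceSplitDemo {
        public static void main(String[] args) {
            String text = "The quick brown fox in a long sentence with another quick brown fox. "
                    + "Another sentence with brown fox.";
            int maxLen = 10; // same bound as BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10)
            BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
            BreakIterator words = BreakIterator.getWordInstance(Locale.ROOT);
            sentences.setText(text);
            words.setText(text);
            int start = sentences.first();
            for (int end = sentences.next(); end != BreakIterator.DONE; start = end, end = sentences.next()) {
                int fragStart = start;
                // Re-split any sentence longer than maxLen at the first word
                // boundary *after* maxLen characters, as the commit message describes.
                while (end - fragStart > maxLen) {
                    int split = words.following(fragStart + maxLen);
                    if (split == BreakIterator.DONE || split >= end) {
                        break;
                    }
                    System.out.println("[" + text.substring(fragStart, split).trim() + "]");
                    fragStart = split;
                }
                System.out.println("[" + text.substring(fragStart, end).trim() + "]");
            }
        }
    }

Run against the first input above, this prints short, word-aligned pieces such as "The quick brown" and "fox in a long", matching the shape of the expected outputs in testSentenceBoundedBreakIterator.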
@@ -751,52 +751,69 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
}

public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
public void testHighlighterWithSentenceBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();

indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));

logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
for (String type : new String[] {"unified", "fvh"}) {
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE));
.field("field1", 21, 2)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE));
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();

SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
equalTo("A <xxx>sentence</xxx> with few words"),
equalTo("A <xxx>sentence</xxx> with few words. ")
));

assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
equalTo("Another <xxx>sentence</xxx> with"),
equalTo("Another <xxx>sentence</xxx> with even more words. ")
));
}
}

public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();

indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "A sentence with few words. Another sentence with even more words."));

logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
for (String type : new String[] {"fvh", "unified"}) {
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "sentence"))
.highlighter(highlight()
.field("field1", 20, 2)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
.field("field1", 21, 2)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.SENTENCE)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));

SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();

assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
equalTo("A <xxx>sentence</xxx> with few words"),
equalTo("A <xxx>sentence</xxx> with few words. ")
));

assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
equalTo("Another <xxx>sentence</xxx> with"),
equalTo("Another <xxx>sentence</xxx> with even more words. ")
));
}
}

public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
public void testHighlighterWithWordBoundaryScanner() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();

@@ -804,39 +821,48 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));

logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD));
for (String type : new String[] {"unified", "fvh"}) {
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD));

SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();

assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
equalTo("<xxx>some</xxx> quick and hairy brown"),
equalTo("<xxx>some</xxx>")
));
}
}

public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
ensureGreen();

indexRandom(true, client().prepareIndex("test", "type1")
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));

logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
SearchSourceBuilder source = searchSource()
for (String type : new String[] {"unified", "fvh"}) {
SearchSourceBuilder source = searchSource()
.query(termQuery("field1", "some"))
.highlighter(highlight()
.field("field1", 23, 1)
.order("score")
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
.field("field1", 23, 1)
.highlighterType(type)
.preTags("<xxx>").postTags("</xxx>")
.boundaryScannerType(BoundaryScannerType.WORD)
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));

SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();

assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
equalTo("<xxx>some</xxx> quick and hairy brown"),
equalTo("<xxx>some</xxx>")
));
}
}

/**
@@ -1841,16 +1867,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

// Unified hl also works but the fragment is longer than the plain highlighter's because the boundary is a word boundary
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// We can also ask for a fragment longer than the input string and get the whole string
field.highlighterType("plain").noMatchSize(text.length() * 2);
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
@@ -1860,16 +1886,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));

field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));

//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// We can also ask for a fragment exactly the size of the input field and get the whole field
field.highlighterType("plain").noMatchSize(text.length());
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
@@ -1879,16 +1904,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));

// unified hl returns the first sentence as noMatchSize does not cross the sentence boundary.
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo(text));

//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// You can set noMatchSize globally in the highlighter as well
field.highlighterType("plain").noMatchSize(null);
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
@@ -1898,12 +1923,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// We don't break if noMatchSize is less than zero though
@@ -1947,16 +1972,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));

// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));

// And noMatchSize returns nothing when the first entry is empty string!
index("test", "type1", "2", "text", new String[] {"", text2});
refresh();
@@ -1980,11 +2004,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");

// except for the unified highlighter which starts from the first string with actual content
field.highlighterType("unified");
response = client().prepareSearch("test")
.setQuery(idsQueryBuilder)
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");
assertHighlight(response, 0, "text", 0, 1, equalTo("I am short"));

// But if the field was actually empty then you should get no highlighting field
index("test", "type1", "3", "text", new String[] {});
@@ -2031,7 +2056,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
.highlighter(new HighlightBuilder().field(field)).get();
assertNotHighlighted(response, 0, "text");

field.highlighterType("fvh");
field.highlighterType("unified");
response = client().prepareSearch("test")
.setQuery(idsQueryBuilder)
.highlighter(new HighlightBuilder().field(field)).get();
@@ -2081,16 +2106,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));

field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));


// Postings hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("postings");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));

// Unified hl also works but the fragment is the whole first sentence (size ignored)
field.highlighterType("unified");
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));

//if there's a match we only return the values with matches (whole value as number_of_fragments == 0)
MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth");
field.highlighterType("plain");

@@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a
* `highlight_query`
* `pre_tags` and `post_tags`
* `require_field_match`
* `boundary_scanner` (`sentence` (**default**) or `word`)
* `max_fragment_length` (only for `sentence` scanner)
* `no_match_size`

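For Java-API users, the options listed above map onto `HighlightBuilder` setters. A minimal, hypothetical sketch (the field name `body` and all values are illustrative, not from this commit):

    // Hypothetical: the unified highlighter options above, set through the Java API.
    HighlightBuilder highlight = new HighlightBuilder()
        .field("body", 150, 3)                                  // fragment size / number of fragments
        .highlighterType("unified")
        .boundaryScannerType(BoundaryScannerType.SENTENCE)      // boundary_scanner
        .preTags("<em>").postTags("</em>")                      // pre_tags / post_tags
        .requireFieldMatch(true)                                // require_field_match
        .highlightQuery(QueryBuilders.termQuery("body", "fox")) // highlight_query
        .noMatchSize(100);                                      // no_match_size
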
==== Force highlighter type


@@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from.
In the case where there is no matching fragment to highlight, the default is
to not return anything. Instead, we can return a snippet of text from the
beginning of the field by setting `no_match_size` (default `0`) to the length
of the text that you want returned. The actual length may be shorter than
of the text that you want returned. The actual length may be shorter or longer than
specified as it tries to break on a word boundary. When using the postings
highlighter it is not possible to control the actual size of the snippet,
therefore the first sentence gets returned whenever `no_match_size` is
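The `no_match_size` behavior this paragraph describes is what the `noMatchSize` assertions earlier in this diff exercise. A minimal sketch using the same test-client API (it assumes the integration-test context above, with its `client()` helper and the `test` index / `text` field from those tests):

    // Ask for roughly the first 21 characters of "text" even when nothing
    // matches; the unified highlighter trims back to a word boundary.
    HighlightBuilder.Field field = new HighlightBuilder.Field("text")
            .highlighterType("unified");
    SearchResponse response = client().prepareSearch("test")
            .highlighter(new HighlightBuilder().field(field).noMatchSize(21))
            .get();
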
@@ -504,21 +507,26 @@ GET /_search
[[boundary-scanners]]
==== Boundary Scanners

When highlighting a field using the fast vector highlighter, you can specify
how to break the highlighted fragments using `boundary_scanner`, which accepts
When highlighting a field using the unified highlighter or the fast vector highlighter,
you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts
the following values:

* `chars` (default): allows configuring which characters (`boundary_chars`)
* `chars` (default mode for the FVH): allows configuring which characters (`boundary_chars`)
constitute a boundary for highlighting. It's a single string with each boundary
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
the `boundary_max_scan` to control how far to look for boundary characters
(defaults to `20`).
(defaults to `20`). Works only with the Fast Vector Highlighter.

* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _word_ or _sentence_ boundary.
* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
to break the highlighted fragments at the next _sentence_ or _word_ boundary.
You can further specify `boundary_scanner_locale` to control which Locale is used
to search the text for these boundaries.

[NOTE]
When used with the `unified` highlighter, the `sentence` scanner splits sentences
bigger than `fragment_size` at the first word boundary after `fragment_size` is reached.
You can set `fragment_size` to `0` to never split any sentence.

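Mirroring testHighlighterWithSentenceBoundaryScanner earlier in this diff, a minimal request sketch that exercises the note above (a `fragment_size` of 21 forces the bounded split; index and field names are the ones used in those tests):

    // Sentence scanner with the unified highlighter; fragment_size (21 here)
    // bounds each sentence fragment, splitting at the next word boundary.
    SearchSourceBuilder source = new SearchSourceBuilder()
            .query(QueryBuilders.termQuery("field1", "sentence"))
            .highlighter(new HighlightBuilder()
                    .field("field1", 21, 2)
                    .highlighterType("unified")
                    .boundaryScannerType(BoundaryScannerType.SENTENCE)
                    .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
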
[[matched-fields]]
==== Matched Fields
The Fast Vector Highlighter can combine matches on multiple fields to