Add support for fragment_length in the unified highlighter (#23431)
* Add support for fragment_length in the unified highlighter This commit introduces a new break iterator (a BoundedBreakIterator) designed for the unified highlighter that is able to limit the size of fragments produced by generic break iterators like `sentence`. The `unified` highlighter now supports `boundary_scanner`, which can be `words` or `sentence`. The `sentence` mode will use the bounded break iterator in order to limit the size of the sentence to `fragment_length`. When sentences bigger than `fragment_length` are produced, this mode will break the sentence at the next word boundary **after** `fragment_length` is reached.
This commit is contained in:
parent
c462d7d486
commit
b8c352fc3f
|
@ -0,0 +1,171 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.text.CharacterIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A custom break iterator that scans text to find break-delimited passages bounded by
|
||||||
|
* a provided maximum length. This class delegates the boundary search to a first level
|
||||||
|
* break iterator. When this break iterator finds a passage greater than the maximum length
|
||||||
|
* a secondary break iterator is used to re-split the passage at the first boundary after
|
||||||
|
* maximum length.
|
||||||
|
* This is useful to split passages created by {@link BreakIterator}s like `sentence` that
|
||||||
|
* can create big outliers on semi-structured text.
|
||||||
|
*
|
||||||
|
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
|
||||||
|
**/
|
||||||
|
public class BoundedBreakIteratorScanner extends BreakIterator {
    // First-level iterator (e.g. sentence) that produces the candidate passages.
    private final BreakIterator mainBreak;
    // Second-level iterator (e.g. word) used to re-split passages longer than maxLen.
    private final BreakIterator innerBreak;
    // Maximum passage length (in chars) before re-splitting with innerBreak.
    private final int maxLen;

    // offset-1 from the previous preceding() call; enforces the increasing-offset
    // usage pattern expected from the UnifiedHighlighter.
    private int lastPrecedingOffset = -1;
    // Bounds of the passage found by mainBreak around the last requested offset.
    private int windowStart = -1;
    private int windowEnd = -1;
    // Bounds of the (possibly re-split) passage reported to the caller.
    private int innerStart = -1;
    // NOTE(review): innerEnd starts at 0 while the other fields start at -1; the
    // guard in following() compares against -1, so the asymmetry looks deliberate
    // but is worth confirming.
    private int innerEnd = 0;

    /** Use {@link #getSentence(Locale, int)} to create instances. */
    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
                                        BreakIterator innerBreak,
                                        int maxLen) {
        this.mainBreak = mainBreak;
        this.innerBreak = innerBreak;
        this.maxLen = maxLen;
    }

    @Override
    public CharacterIterator getText() {
        return mainBreak.getText();
    }

    @Override
    public void setText(CharacterIterator newText) {
        // Reset the window state before pointing both iterators at the new text.
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    @Override
    public void setText(String newText) {
        // Reset the window state before pointing both iterators at the new text.
        reset();
        mainBreak.setText(newText);
        innerBreak.setText(newText);
    }

    /** Restores the initial window/passage state so a new text can be scanned. */
    private void reset() {
        lastPrecedingOffset = -1;
        windowStart = -1;
        windowEnd = -1;
        innerStart = -1;
        innerEnd = 0;
    }

    /**
     * Returns the start offset of the passage containing {@code offset}.
     * Must be called with increasing offset. See {@link FieldHighlighter} for usage.
     */
    @Override
    public int preceding(int offset) {
        if (offset < lastPrecedingOffset) {
            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
                    "usage doesn't look like UnifiedHighlighter");
        }
        if (offset > windowStart && offset < windowEnd) {
            // Still inside the window computed by a previous call: the new inner
            // passage starts where the previous one ended and extends to the end
            // of the window.
            innerStart = innerEnd;
            innerEnd = windowEnd;
        } else {
            // New window: ask the main iterator for the passage surrounding offset.
            // NOTE(review): BreakIterator.preceding/following can return DONE (-1)
            // at the text boundaries; that case is not handled here — confirm the
            // UnifiedHighlighter never passes offsets where this can happen.
            windowStart = innerStart = mainBreak.preceding(offset);
            windowEnd = innerEnd = mainBreak.following(offset-1);
        }

        if (innerEnd - innerStart > maxLen) {
            // the current split is too big,
            // so starting from the current term we try to find boundaries on the left first
            if (offset - maxLen > innerStart) {
                innerStart = Math.max(innerStart,
                        innerBreak.preceding(offset - maxLen));
            }
            // and then we try to expand the passage to the right with the remaining size
            int remaining = Math.max(0, maxLen - (offset - innerStart));
            if (offset + remaining < windowEnd) {
                innerEnd = Math.min(windowEnd,
                        innerBreak.following(offset + remaining));
            }
        }
        lastPrecedingOffset = offset - 1;
        return innerStart;
    }

    /**
     * Returns the end offset of the passage found by the last {@link #preceding(int)}.
     * Can be invoked only after a call to preceding(offset+1).
     * See {@link FieldHighlighter} for usage.
     */
    @Override
    public int following(int offset) {
        if (offset != lastPrecedingOffset || innerEnd == -1) {
            throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
                    "usage doesn't look like UnifiedHighlighter");
        }
        return innerEnd;
    }

    /**
     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
     * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
     */
    public static BreakIterator getSentence(Locale locale, int maxLen) {
        final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
        final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
        return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
    }


    @Override
    public int current() {
        // Returns the last offset of the current split
        return this.innerEnd;
    }

    // The remaining BreakIterator navigation methods are not part of the
    // UnifiedHighlighter's usage pattern and are therefore rejected.

    @Override
    public int first() {
        throw new IllegalStateException("first() should not be called in this context");
    }

    @Override
    public int next() {
        throw new IllegalStateException("next() should not be called in this context");
    }

    @Override
    public int last() {
        throw new IllegalStateException("last() should not be called in this context");
    }

    @Override
    public int next(int n) {
        throw new IllegalStateException("next(n) should not be called in this context");
    }

    @Override
    public int previous() {
        throw new IllegalStateException("previous() should not be called in this context");
    }
}
|
|
@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
|
||||||
|
* no highlights were found.
|
||||||
|
*/
|
||||||
|
class CustomFieldHighlighter extends FieldHighlighter {
|
||||||
|
private static final Passage[] EMPTY_PASSAGE = new Passage[0];
|
||||||
|
|
||||||
|
private final Locale breakIteratorLocale;
|
||||||
|
private final int noMatchSize;
|
||||||
|
private final String fieldValue;
|
||||||
|
|
||||||
|
CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
|
||||||
|
Locale breakIteratorLocale, BreakIterator breakIterator,
|
||||||
|
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
|
||||||
|
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
|
||||||
|
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
|
||||||
|
maxNoHighlightPassages, passageFormatter);
|
||||||
|
this.breakIteratorLocale = breakIteratorLocale;
|
||||||
|
this.noMatchSize = noMatchSize;
|
||||||
|
this.fieldValue = fieldValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
|
||||||
|
if (noMatchSize > 0) {
|
||||||
|
int pos = 0;
|
||||||
|
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
|
||||||
|
pos ++;
|
||||||
|
}
|
||||||
|
if (pos < fieldValue.length()) {
|
||||||
|
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
|
||||||
|
if (end == -1) {
|
||||||
|
end = fieldValue.length();
|
||||||
|
}
|
||||||
|
if (noMatchSize+pos < end) {
|
||||||
|
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
|
||||||
|
bi.setText(fieldValue);
|
||||||
|
// Finds the next word boundary **after** noMatchSize.
|
||||||
|
end = bi.following(noMatchSize + pos);
|
||||||
|
if (end == BreakIterator.DONE) {
|
||||||
|
end = fieldValue.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Passage passage = new Passage();
|
||||||
|
passage.setScore(Float.NaN);
|
||||||
|
passage.setStartOffset(pos);
|
||||||
|
passage.setEndOffset(end);
|
||||||
|
return new Passage[]{passage};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return EMPTY_PASSAGE;
|
||||||
|
}
|
||||||
|
}
|
|
@ -33,6 +33,8 @@ import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||||
import org.apache.lucene.search.spans.SpanQuery;
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
import org.elasticsearch.common.Nullable;
|
import org.elasticsearch.common.Nullable;
|
||||||
import org.elasticsearch.common.lucene.all.AllTermQuery;
|
import org.elasticsearch.common.lucene.all.AllTermQuery;
|
||||||
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
||||||
|
@ -47,6 +49,7 @@ import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
|
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
|
||||||
|
@ -57,12 +60,14 @@ import java.util.Map;
|
||||||
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
|
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
|
||||||
*/
|
*/
|
||||||
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
||||||
|
public static final char MULTIVAL_SEP_CHAR = (char) 0;
|
||||||
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
|
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
|
||||||
|
|
||||||
private final String fieldValue;
|
private final String fieldValue;
|
||||||
private final PassageFormatter passageFormatter;
|
private final PassageFormatter passageFormatter;
|
||||||
private final BreakIterator breakIterator;
|
private final BreakIterator breakIterator;
|
||||||
private final boolean returnNonHighlightedSnippets;
|
private final Locale breakIteratorLocale;
|
||||||
|
private final int noMatchSize;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new instance of {@link CustomUnifiedHighlighter}
|
* Creates a new instance of {@link CustomUnifiedHighlighter}
|
||||||
|
@ -70,24 +75,26 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
||||||
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
|
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
|
||||||
* @param passageFormatter our own {@link CustomPassageFormatter}
|
* @param passageFormatter our own {@link CustomPassageFormatter}
|
||||||
* which generates snippets in forms of {@link Snippet} objects
|
* which generates snippets in forms of {@link Snippet} objects
|
||||||
|
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
|
||||||
|
* If null {@link Locale#ROOT} is used
|
||||||
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
|
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
|
||||||
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
|
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
|
||||||
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
|
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
|
||||||
* the relevant stored field.
|
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
|
||||||
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
|
|
||||||
* returned rather than empty snippets when no highlighting can be performed
|
|
||||||
*/
|
*/
|
||||||
public CustomUnifiedHighlighter(IndexSearcher searcher,
|
public CustomUnifiedHighlighter(IndexSearcher searcher,
|
||||||
Analyzer analyzer,
|
Analyzer analyzer,
|
||||||
PassageFormatter passageFormatter,
|
PassageFormatter passageFormatter,
|
||||||
|
@Nullable Locale breakIteratorLocale,
|
||||||
@Nullable BreakIterator breakIterator,
|
@Nullable BreakIterator breakIterator,
|
||||||
String fieldValue,
|
String fieldValue,
|
||||||
boolean returnNonHighlightedSnippets) {
|
int noMatchSize) {
|
||||||
super(searcher, analyzer);
|
super(searcher, analyzer);
|
||||||
this.breakIterator = breakIterator;
|
this.breakIterator = breakIterator;
|
||||||
|
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
|
||||||
this.passageFormatter = passageFormatter;
|
this.passageFormatter = passageFormatter;
|
||||||
this.fieldValue = fieldValue;
|
this.fieldValue = fieldValue;
|
||||||
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
|
this.noMatchSize = noMatchSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -111,17 +118,14 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
||||||
@Override
|
@Override
|
||||||
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
|
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
|
||||||
int cacheCharsThreshold) throws IOException {
|
int cacheCharsThreshold) throws IOException {
|
||||||
//we only highlight one field, one document at a time
|
// we only highlight one field, one document at a time
|
||||||
return Collections.singletonList(new String[]{fieldValue});
|
return Collections.singletonList(new String[]{fieldValue});
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected BreakIterator getBreakIterator(String field) {
|
protected BreakIterator getBreakIterator(String field) {
|
||||||
if (breakIterator != null) {
|
|
||||||
return breakIterator;
|
return breakIterator;
|
||||||
}
|
}
|
||||||
return super.getBreakIterator(field);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PassageFormatter getFormatter(String field) {
|
protected PassageFormatter getFormatter(String field) {
|
||||||
|
@ -129,11 +133,18 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int getMaxNoHighlightPassages(String field) {
|
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
|
||||||
if (returnNonHighlightedSnippets) {
|
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
|
||||||
return 1;
|
Set<HighlightFlag> highlightFlags = getFlags(field);
|
||||||
}
|
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
|
||||||
return 0;
|
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
|
||||||
|
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
|
||||||
|
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
|
||||||
|
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
|
||||||
|
FieldOffsetStrategy strategy =
|
||||||
|
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
|
||||||
|
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
|
||||||
|
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -146,7 +157,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
|
||||||
return rewriteCustomQuery(query);
|
return rewriteCustomQuery(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Translate custom queries in queries that are supported by the unified highlighter.
|
* Translate custom queries in queries that are supported by the unified highlighter.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -52,13 +52,14 @@ import java.util.Map;
|
||||||
public class FastVectorHighlighter implements Highlighter {
|
public class FastVectorHighlighter implements Highlighter {
|
||||||
|
|
||||||
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
|
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
|
||||||
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
|
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
|
||||||
BreakIterator.getSentenceInstance(Locale.ROOT));
|
new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
|
||||||
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
|
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER =
|
||||||
BreakIterator.getWordInstance(Locale.ROOT));
|
new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
|
||||||
|
|
||||||
|
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE =
|
||||||
|
Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope);
|
||||||
|
|
||||||
public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
|
|
||||||
true, Setting.Property.NodeScope);
|
|
||||||
private static final String CACHE_KEY = "highlight-fsv";
|
private static final String CACHE_KEY = "highlight-fsv";
|
||||||
private final Boolean termVectorMultiValue;
|
private final Boolean termVectorMultiValue;
|
||||||
|
|
||||||
|
@ -74,11 +75,12 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
FieldMapper mapper = highlighterContext.mapper;
|
FieldMapper mapper = highlighterContext.mapper;
|
||||||
|
|
||||||
if (canHighlight(mapper) == false) {
|
if (canHighlight(mapper) == false) {
|
||||||
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
|
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName +
|
||||||
+ "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
|
"] should be indexed with term vector with position offsets to be used with fast vector highlighter");
|
||||||
}
|
}
|
||||||
|
|
||||||
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
|
Encoder encoder = field.fieldOptions().encoder().equals("html") ?
|
||||||
|
HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
|
||||||
|
|
||||||
if (!hitContext.cache().containsKey(CACHE_KEY)) {
|
if (!hitContext.cache().containsKey(CACHE_KEY)) {
|
||||||
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
|
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
|
||||||
|
@ -90,21 +92,21 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
if (field.fieldOptions().requireFieldMatch()) {
|
if (field.fieldOptions().requireFieldMatch()) {
|
||||||
if (cache.fieldMatchFieldQuery == null) {
|
if (cache.fieldMatchFieldQuery == null) {
|
||||||
/*
|
/*
|
||||||
* we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
|
* we use top level reader to rewrite the query against all readers,
|
||||||
* readers...)
|
* with use caching it across hits (and across readers...)
|
||||||
*/
|
*/
|
||||||
cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
|
cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
|
||||||
true, field.fieldOptions().requireFieldMatch());
|
hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
|
||||||
}
|
}
|
||||||
fieldQuery = cache.fieldMatchFieldQuery;
|
fieldQuery = cache.fieldMatchFieldQuery;
|
||||||
} else {
|
} else {
|
||||||
if (cache.noFieldMatchFieldQuery == null) {
|
if (cache.noFieldMatchFieldQuery == null) {
|
||||||
/*
|
/*
|
||||||
* we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
|
* we use top level reader to rewrite the query against all readers,
|
||||||
* readers...)
|
* with use caching it across hits (and across readers...)
|
||||||
*/
|
*/
|
||||||
cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
|
cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
|
||||||
true, field.fieldOptions().requireFieldMatch());
|
hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
|
||||||
}
|
}
|
||||||
fieldQuery = cache.noFieldMatchFieldQuery;
|
fieldQuery = cache.noFieldMatchFieldQuery;
|
||||||
}
|
}
|
||||||
|
@ -142,7 +144,8 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
|
fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
|
||||||
field.fieldOptions().postTags(), boundaryScanner);
|
field.fieldOptions().postTags(), boundaryScanner);
|
||||||
} else {
|
} else {
|
||||||
fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
|
fragmentsBuilder =
|
||||||
|
new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
|
||||||
field.fieldOptions().postTags(), boundaryScanner);
|
field.fieldOptions().postTags(), boundaryScanner);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -153,8 +156,8 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
entry.fragmentsBuilder = fragmentsBuilder;
|
entry.fragmentsBuilder = fragmentsBuilder;
|
||||||
if (cache.fvh == null) {
|
if (cache.fvh == null) {
|
||||||
// parameters to FVH are not requires since:
|
// parameters to FVH are not requires since:
|
||||||
// first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
|
// first two booleans are not relevant since they are set on the CustomFieldQuery
|
||||||
// fragment builders are used explicitly
|
// (phrase and fieldMatch) fragment builders are used explicitly
|
||||||
cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
|
cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
|
||||||
}
|
}
|
||||||
CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
|
CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
|
||||||
|
@ -172,13 +175,14 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
// we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
|
// we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
|
||||||
// Only send matched fields if they were requested to save time.
|
// Only send matched fields if they were requested to save time.
|
||||||
if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) {
|
if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) {
|
||||||
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
|
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
|
||||||
field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
|
mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize,
|
||||||
entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
|
numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
|
||||||
} else {
|
|
||||||
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
|
|
||||||
fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(),
|
|
||||||
field.fieldOptions().postTags(), encoder);
|
field.fieldOptions().postTags(), encoder);
|
||||||
|
} else {
|
||||||
|
fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(),
|
||||||
|
mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder,
|
||||||
|
entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fragments != null && fragments.length > 0) {
|
if (fragments != null && fragments.length > 0) {
|
||||||
|
@ -187,11 +191,13 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
|
|
||||||
int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
|
int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
|
||||||
if (noMatchSize > 0) {
|
if (noMatchSize > 0) {
|
||||||
// Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder
|
// Essentially we just request that a fragment is built from 0 to noMatchSize using
|
||||||
|
// the normal fragmentsBuilder
|
||||||
FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/);
|
FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/);
|
||||||
fieldFragList.add(0, noMatchSize, Collections.<WeightedPhraseInfo>emptyList());
|
fieldFragList.add(0, noMatchSize, Collections.<WeightedPhraseInfo>emptyList());
|
||||||
fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(),
|
fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(),
|
||||||
fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder);
|
mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(),
|
||||||
|
field.fieldOptions().postTags(), encoder);
|
||||||
if (fragments != null && fragments.length > 0) {
|
if (fragments != null && fragments.length > 0) {
|
||||||
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
|
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
|
||||||
}
|
}
|
||||||
|
@ -200,7 +206,8 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
|
throw new FetchPhaseExecutionException(context,
|
||||||
|
"Failed to highlight field [" + highlighterContext.fieldName + "]", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -212,8 +219,13 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
|
|
||||||
private static BoundaryScanner getBoundaryScanner(Field field) {
|
private static BoundaryScanner getBoundaryScanner(Field field) {
|
||||||
final FieldOptions fieldOptions = field.fieldOptions();
|
final FieldOptions fieldOptions = field.fieldOptions();
|
||||||
final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale();
|
final Locale boundaryScannerLocale =
|
||||||
switch(fieldOptions.boundaryScannerType()) {
|
fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
|
||||||
|
Locale.ROOT;
|
||||||
|
final HighlightBuilder.BoundaryScannerType type =
|
||||||
|
fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
|
||||||
|
HighlightBuilder.BoundaryScannerType.CHARS;
|
||||||
|
switch(type) {
|
||||||
case SENTENCE:
|
case SENTENCE:
|
||||||
if (boundaryScannerLocale != null) {
|
if (boundaryScannerLocale != null) {
|
||||||
return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
|
return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
|
||||||
|
@ -224,12 +236,14 @@ public class FastVectorHighlighter implements Highlighter {
|
||||||
return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
|
return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
|
||||||
}
|
}
|
||||||
return DEFAULT_WORD_BOUNDARY_SCANNER;
|
return DEFAULT_WORD_BOUNDARY_SCANNER;
|
||||||
default:
|
case CHARS:
|
||||||
if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|
if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|
||||||
|| fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
|
|| fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
|
||||||
return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
|
return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
|
||||||
}
|
}
|
||||||
return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
|
return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
|
||||||
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
|
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
|
||||||
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
|
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
|
||||||
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
|
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
|
||||||
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
|
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
|
||||||
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
|
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
|
||||||
.boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
|
.boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.highlight.Encoder;
|
import org.apache.lucene.search.highlight.Encoder;
|
||||||
import org.apache.lucene.search.highlight.Snippet;
|
import org.apache.lucene.search.highlight.Snippet;
|
||||||
|
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
|
||||||
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
|
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
|
||||||
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
|
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -34,12 +35,15 @@ import org.elasticsearch.search.fetch.FetchSubPhase;
|
||||||
import org.elasticsearch.search.internal.SearchContext;
|
import org.elasticsearch.search.internal.SearchContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.text.BreakIterator;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
|
||||||
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
|
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
|
||||||
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
|
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
|
||||||
|
|
||||||
|
@ -93,19 +97,22 @@ public class UnifiedHighlighter implements Highlighter {
|
||||||
// we use a control char to separate values, which is the only char that the custom break iterator
|
// we use a control char to separate values, which is the only char that the custom break iterator
|
||||||
// breaks the text on, so we don't lose the distinction between the different values of a field and we
|
// breaks the text on, so we don't lose the distinction between the different values of a field and we
|
||||||
// get back a snippet per value
|
// get back a snippet per value
|
||||||
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
|
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
|
||||||
org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator =
|
org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator =
|
||||||
new org.apache.lucene.search.postingshighlight
|
new org.apache.lucene.search.postingshighlight
|
||||||
.CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
|
.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
|
||||||
highlighter =
|
highlighter =
|
||||||
new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
|
new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
|
||||||
breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
|
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue,
|
||||||
|
field.fieldOptions().noMatchSize());
|
||||||
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
|
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
|
||||||
} else {
|
} else {
|
||||||
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
|
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
|
||||||
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
|
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
|
||||||
|
BreakIterator bi = getBreakIterator(field);
|
||||||
highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||||
mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0);
|
mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi,
|
||||||
|
fieldValue, field.fieldOptions().noMatchSize());
|
||||||
numberOfFragments = field.fieldOptions().numberOfFragments();
|
numberOfFragments = field.fieldOptions().numberOfFragments();
|
||||||
}
|
}
|
||||||
if (field.fieldOptions().requireFieldMatch()) {
|
if (field.fieldOptions().requireFieldMatch()) {
|
||||||
|
@ -144,11 +151,34 @@ public class UnifiedHighlighter implements Highlighter {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
static class HighlighterEntry {
|
private BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
|
||||||
|
final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions();
|
||||||
|
final Locale locale =
|
||||||
|
fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
|
||||||
|
Locale.ROOT;
|
||||||
|
final HighlightBuilder.BoundaryScannerType type =
|
||||||
|
fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() :
|
||||||
|
HighlightBuilder.BoundaryScannerType.SENTENCE;
|
||||||
|
int maxLen = fieldOptions.fragmentCharSize();
|
||||||
|
switch (type) {
|
||||||
|
case SENTENCE:
|
||||||
|
if (maxLen > 0) {
|
||||||
|
return BoundedBreakIteratorScanner.getSentence(locale, maxLen);
|
||||||
|
}
|
||||||
|
return BreakIterator.getSentenceInstance(locale);
|
||||||
|
case WORD:
|
||||||
|
// ignore maxLen
|
||||||
|
return BreakIterator.getWordInstance(locale);
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class HighlighterEntry {
|
||||||
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
|
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
static class MapperHighlighterEntry {
|
private static class MapperHighlighterEntry {
|
||||||
final CustomPassageFormatter passageFormatter;
|
final CustomPassageFormatter passageFormatter;
|
||||||
|
|
||||||
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
|
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
|
||||||
|
|
|
@ -0,0 +1,138 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
import static org.hamcrest.Matchers.greaterThan;
|
||||||
|
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
||||||
|
import static org.hamcrest.Matchers.lessThanOrEqualTo;
|
||||||
|
|
||||||
|
public class BoundedBreakIteratorScannerTests extends ESTestCase {
|
||||||
|
private static final String[] WORD_BOUNDARIES =
|
||||||
|
new String[] { " ", " ", "\t", "#", "\n" };
|
||||||
|
private static final String[] SENTENCE_BOUNDARIES =
|
||||||
|
new String[] { "! ", "? ", ". ", ".\n", ".\n\n" };
|
||||||
|
|
||||||
|
private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) {
|
||||||
|
// Generate a random set of unique terms with ascii character
|
||||||
|
int maxSize = randomIntBetween(5, 100);
|
||||||
|
String[] vocabulary = new String[maxSize];
|
||||||
|
for (int i = 0; i < maxSize; i++) {
|
||||||
|
if (rarely()) {
|
||||||
|
vocabulary[i] = randomAsciiOfLengthBetween(50, 200);
|
||||||
|
} else {
|
||||||
|
vocabulary[i] = randomAsciiOfLengthBetween(1, 30);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate a random text made of random terms separated with word-boundaries
|
||||||
|
// and sentence-boundaries.
|
||||||
|
StringBuilder text = new StringBuilder();
|
||||||
|
List<Integer> offsetList = new ArrayList<> ();
|
||||||
|
List<Integer> sizeList = new ArrayList<> ();
|
||||||
|
// the number of sentences to generate
|
||||||
|
int numSentences = randomIntBetween(10, 100);
|
||||||
|
int maxTermLen = 0;
|
||||||
|
for (int i = 0; i < numSentences; i++) {
|
||||||
|
// the number of terms in the sentence
|
||||||
|
int numTerms = randomIntBetween(5, 10);
|
||||||
|
for (int j = 0; j < numTerms; j++) {
|
||||||
|
int termId = randomIntBetween(0, vocabulary.length - 1);
|
||||||
|
String term = vocabulary[termId].toLowerCase(Locale.ROOT);
|
||||||
|
if (j == 0) {
|
||||||
|
// capitalize the first letter of the first term in the sentence
|
||||||
|
term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1);
|
||||||
|
} else {
|
||||||
|
String sep = randomFrom(WORD_BOUNDARIES);
|
||||||
|
text.append(sep);
|
||||||
|
}
|
||||||
|
maxTermLen = Math.max(term.length(), maxTermLen);
|
||||||
|
offsetList.add(text.length());
|
||||||
|
sizeList.add(term.length());
|
||||||
|
text.append(term);
|
||||||
|
}
|
||||||
|
String boundary = randomFrom(SENTENCE_BOUNDARIES);
|
||||||
|
text.append(boundary);
|
||||||
|
}
|
||||||
|
|
||||||
|
int[] sizes = sizeList.stream().mapToInt(i->i).toArray();
|
||||||
|
int[] offsets = offsetList.stream().mapToInt(i->i).toArray();
|
||||||
|
|
||||||
|
bi.setText(text.toString());
|
||||||
|
int currentPos = randomIntBetween(0, 20);
|
||||||
|
int lastEnd = -1;
|
||||||
|
int maxPassageLen = maxLen+(maxTermLen*2);
|
||||||
|
while (currentPos < offsets.length) {
|
||||||
|
// find the passage that contains the current term
|
||||||
|
int nextOffset = offsets[currentPos];
|
||||||
|
int start = bi.preceding(nextOffset+1);
|
||||||
|
int end = bi.following(nextOffset);
|
||||||
|
|
||||||
|
// check that the passage is valid
|
||||||
|
assertThat(start, greaterThanOrEqualTo(lastEnd));
|
||||||
|
assertThat(end, greaterThan(start));
|
||||||
|
assertThat(start, lessThanOrEqualTo(nextOffset));
|
||||||
|
assertThat(end, greaterThanOrEqualTo(nextOffset));
|
||||||
|
int passageLen = end-start;
|
||||||
|
assertThat(passageLen, lessThanOrEqualTo(maxPassageLen));
|
||||||
|
|
||||||
|
// checks that the start and end of the passage are on word boundaries.
|
||||||
|
int startPos = Arrays.binarySearch(offsets, start);
|
||||||
|
int endPos = Arrays.binarySearch(offsets, end);
|
||||||
|
if (startPos < 0) {
|
||||||
|
int lastWordEnd =
|
||||||
|
offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2];
|
||||||
|
assertThat(start, greaterThanOrEqualTo(lastWordEnd));
|
||||||
|
}
|
||||||
|
if (endPos < 0) {
|
||||||
|
if (Math.abs(endPos)-2 < offsets.length) {
|
||||||
|
int lastWordEnd =
|
||||||
|
offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2];
|
||||||
|
assertThat(end, greaterThanOrEqualTo(lastWordEnd));
|
||||||
|
}
|
||||||
|
// advance the position to the end of the current passage
|
||||||
|
currentPos = (Math.abs(endPos) - 1);
|
||||||
|
} else {
|
||||||
|
// advance the position to the end of the current passage
|
||||||
|
currentPos = endPos;
|
||||||
|
}
|
||||||
|
// randomly advance to the next term to highlight
|
||||||
|
currentPos += randomIntBetween(0, 20);
|
||||||
|
lastEnd = end;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBoundedSentence() {
|
||||||
|
for (int i = 0; i < 20; i++) {
|
||||||
|
int maxLen = randomIntBetween(10, 500);
|
||||||
|
testRandomAsciiTextCase(
|
||||||
|
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen),
|
||||||
|
maxLen
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -20,20 +20,22 @@
|
||||||
package org.apache.lucene.search.uhighlight;
|
package org.apache.lucene.search.uhighlight;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.FieldType;
|
import org.apache.lucene.document.FieldType;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexOptions;
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.queries.CommonTermsQuery;
|
import org.apache.lucene.queries.CommonTermsQuery;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
|
import org.apache.lucene.search.PhraseQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
@ -41,219 +43,167 @@ import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.search.highlight.DefaultEncoder;
|
import org.apache.lucene.search.highlight.DefaultEncoder;
|
||||||
import org.apache.lucene.search.highlight.Snippet;
|
import org.apache.lucene.search.highlight.Snippet;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.elasticsearch.common.Strings;
|
||||||
import org.elasticsearch.common.lucene.all.AllTermQuery;
|
import org.elasticsearch.common.lucene.all.AllTermQuery;
|
||||||
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
|
||||||
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
|
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.text.BreakIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
|
||||||
import static org.hamcrest.CoreMatchers.equalTo;
|
import static org.hamcrest.CoreMatchers.equalTo;
|
||||||
|
|
||||||
public class CustomUnifiedHighlighterTests extends ESTestCase {
|
public class CustomUnifiedHighlighterTests extends ESTestCase {
|
||||||
public void testCustomUnifiedHighlighter() throws Exception {
|
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query,
|
||||||
|
Locale locale, BreakIterator breakIterator,
|
||||||
|
int noMatchSize, String[] expectedPassages) throws Exception {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
|
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
||||||
iwc.setMergePolicy(newLogMergePolicy());
|
iwc.setMergePolicy(newTieredMergePolicy(random()));
|
||||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
FieldType ft = new FieldType(TextField.TYPE_STORED);
|
||||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
ft.freeze();
|
||||||
offsetsType.setStoreTermVectorOffsets(true);
|
|
||||||
offsetsType.setStoreTermVectorPositions(true);
|
|
||||||
offsetsType.setStoreTermVectors(true);
|
|
||||||
|
|
||||||
//good position but only one match
|
|
||||||
final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter.";
|
|
||||||
Field body = new Field("body", "", offsetsType);
|
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(body);
|
for (String input : inputs) {
|
||||||
body.setStringValue(firstValue);
|
Field field = new Field(fieldName, "", ft);
|
||||||
|
field.setStringValue(input);
|
||||||
//two matches, not the best snippet due to its length though
|
doc.add(field);
|
||||||
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " +
|
}
|
||||||
"that gets scored lower.";
|
|
||||||
Field body2 = new Field("body", "", offsetsType);
|
|
||||||
doc.add(body2);
|
|
||||||
body2.setStringValue(secondValue);
|
|
||||||
|
|
||||||
//two matches and short, will be scored highest
|
|
||||||
final String thirdValue = "This is highlighting the third short highlighting value.";
|
|
||||||
Field body3 = new Field("body", "", offsetsType);
|
|
||||||
doc.add(body3);
|
|
||||||
body3.setStringValue(thirdValue);
|
|
||||||
|
|
||||||
//one match, same as first but at the end, will be scored lower due to its position
|
|
||||||
final String fourthValue = "Just a test4 highlighting from unified highlighter.";
|
|
||||||
Field body4 = new Field("body", "", offsetsType);
|
|
||||||
doc.add(body4);
|
|
||||||
body4.setStringValue(fourthValue);
|
|
||||||
|
|
||||||
iw.addDocument(doc);
|
iw.addDocument(doc);
|
||||||
|
DirectoryReader reader = iw.getReader();
|
||||||
IndexReader ir = iw.getReader();
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
iw.close();
|
iw.close();
|
||||||
|
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
|
||||||
String firstHlValue = "Just a test1 <b>highlighting</b> from unified highlighter.";
|
|
||||||
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
|
|
||||||
" longer text that gets scored lower.";
|
|
||||||
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
|
|
||||||
String fourthHlValue = "Just a test4 <b>highlighting</b> from unified highlighter.";
|
|
||||||
|
|
||||||
IndexSearcher searcher = newSearcher(ir);
|
|
||||||
Query query = new TermQuery(new Term("body", "highlighting"));
|
|
||||||
|
|
||||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
|
||||||
assertThat(topDocs.totalHits, equalTo(1));
|
assertThat(topDocs.totalHits, equalTo(1));
|
||||||
|
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
|
||||||
int docId = topDocs.scoreDocs[0].doc;
|
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
||||||
|
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue,
|
||||||
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue +
|
noMatchSize);
|
||||||
HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
|
highlighter.setFieldMatcher((name) -> "text".equals(name));
|
||||||
|
final Snippet[] snippets =
|
||||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(),
|
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
|
||||||
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), null, fieldValue, true);
|
assertEquals(snippets.length, expectedPassages.length);
|
||||||
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
|
for (int i = 0; i < snippets.length; i++) {
|
||||||
|
assertEquals(snippets[i].getText(), expectedPassages[i]);
|
||||||
assertThat(snippets.length, equalTo(4));
|
}
|
||||||
|
reader.close();
|
||||||
assertThat(snippets[0].getText(), equalTo(firstHlValue));
|
|
||||||
assertThat(snippets[1].getText(), equalTo(secondHlValue));
|
|
||||||
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
|
|
||||||
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
|
|
||||||
ir.close();
|
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSimple() throws Exception {
|
||||||
|
final String[] inputs = {
|
||||||
|
"This is a test. Just a test1 highlighting from unified highlighter.",
|
||||||
|
"This is the second highlighting value to perform highlighting on a longer text that gets scored lower.",
|
||||||
|
"This is highlighting the third short highlighting value.",
|
||||||
|
"Just a test4 highlighting from unified highlighter."
|
||||||
|
};
|
||||||
|
|
||||||
|
String[] expectedPassages = {
|
||||||
|
"Just a test1 <b>highlighting</b> from unified highlighter.",
|
||||||
|
"This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
|
||||||
|
" longer text that gets scored lower.",
|
||||||
|
"This is <b>highlighting</b> the third short <b>highlighting</b> value.",
|
||||||
|
"Just a test4 <b>highlighting</b> from unified highlighter."
|
||||||
|
};
|
||||||
|
Query query = new TermQuery(new Term("text", "highlighting"));
|
||||||
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
|
BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages);
|
||||||
|
}
|
||||||
|
|
||||||
public void testNoMatchSize() throws Exception {
|
public void testNoMatchSize() throws Exception {
|
||||||
Directory dir = newDirectory();
|
final String[] inputs = {
|
||||||
Analyzer analyzer = new StandardAnalyzer();
|
"This is a test. Just a test highlighting from unified. Feel free to ignore."
|
||||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
};
|
||||||
iwc.setMergePolicy(newLogMergePolicy());
|
Query query = new TermQuery(new Term("body", "highlighting"));
|
||||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
|
BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs);
|
||||||
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
|
||||||
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
|
||||||
offsetsType.setStoreTermVectorOffsets(true);
|
|
||||||
offsetsType.setStoreTermVectorPositions(true);
|
|
||||||
offsetsType.setStoreTermVectors(true);
|
|
||||||
Field body = new Field("body", "", offsetsType);
|
|
||||||
Field none = new Field("none", "", offsetsType);
|
|
||||||
Document doc = new Document();
|
|
||||||
doc.add(body);
|
|
||||||
doc.add(none);
|
|
||||||
|
|
||||||
String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore.";
|
|
||||||
body.setStringValue(firstValue);
|
|
||||||
none.setStringValue(firstValue);
|
|
||||||
iw.addDocument(doc);
|
|
||||||
|
|
||||||
IndexReader ir = iw.getReader();
|
|
||||||
iw.close();
|
|
||||||
|
|
||||||
Query query = new TermQuery(new Term("none", "highlighting"));
|
|
||||||
|
|
||||||
IndexSearcher searcher = newSearcher(ir);
|
|
||||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
|
||||||
assertThat(topDocs.totalHits, equalTo(1));
|
|
||||||
int docId = topDocs.scoreDocs[0].doc;
|
|
||||||
|
|
||||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
|
||||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter,
|
|
||||||
null, firstValue, false);
|
|
||||||
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
|
|
||||||
assertThat(snippets.length, equalTo(0));
|
|
||||||
|
|
||||||
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true);
|
|
||||||
snippets = highlighter.highlightField("body", query, docId, 5);
|
|
||||||
assertThat(snippets.length, equalTo(1));
|
|
||||||
assertThat(snippets[0].getText(), equalTo("This is a test."));
|
|
||||||
ir.close();
|
|
||||||
dir.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException {
|
|
||||||
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
|
|
||||||
iwc.setMergePolicy(newLogMergePolicy());
|
|
||||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
|
||||||
|
|
||||||
FieldType ft = new FieldType(TextField.TYPE_STORED);
|
|
||||||
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
|
||||||
Field textField = new Field(field, "", ft);
|
|
||||||
Document doc = new Document();
|
|
||||||
doc.add(textField);
|
|
||||||
|
|
||||||
textField.setStringValue(value);
|
|
||||||
iw.addDocument(doc);
|
|
||||||
IndexReader ir = iw.getReader();
|
|
||||||
iw.close();
|
|
||||||
return ir;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMultiPhrasePrefixQuery() throws Exception {
|
public void testMultiPhrasePrefixQuery() throws Exception {
|
||||||
Analyzer analyzer = new StandardAnalyzer();
|
final String[] inputs = {
|
||||||
Directory dir = newDirectory();
|
"The quick brown fox."
|
||||||
String value = "The quick brown fox.";
|
};
|
||||||
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
|
final String[] outputs = {
|
||||||
|
"The <b>quick</b> <b>brown</b> <b>fox</b>."
|
||||||
|
};
|
||||||
MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery();
|
MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery();
|
||||||
query.add(new Term("text", "quick"));
|
query.add(new Term("text", "quick"));
|
||||||
query.add(new Term("text", "brown"));
|
query.add(new Term("text", "brown"));
|
||||||
query.add(new Term("text", "fo"));
|
query.add(new Term("text", "fo"));
|
||||||
IndexSearcher searcher = newSearcher(ir);
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||||
assertThat(topDocs.totalHits, equalTo(1));
|
|
||||||
int docId = topDocs.scoreDocs[0].doc;
|
|
||||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
|
||||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
|
||||||
passageFormatter, null, value, false);
|
|
||||||
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
|
|
||||||
assertThat(snippets.length, equalTo(1));
|
|
||||||
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
|
|
||||||
ir.close();
|
|
||||||
dir.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAllTermQuery() throws IOException {
|
public void testAllTermQuery() throws Exception {
|
||||||
Directory dir = newDirectory();
|
final String[] inputs = {
|
||||||
String value = "The quick brown fox.";
|
"The quick brown fox."
|
||||||
Analyzer analyzer = new StandardAnalyzer();
|
};
|
||||||
IndexReader ir = indexOneDoc(dir, "all", value, analyzer);
|
final String[] outputs = {
|
||||||
AllTermQuery query = new AllTermQuery(new Term("all", "fox"));
|
"The quick brown <b>fox</b>."
|
||||||
IndexSearcher searcher = newSearcher(ir);
|
};
|
||||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
AllTermQuery query = new AllTermQuery(new Term("text", "fox"));
|
||||||
assertThat(topDocs.totalHits, equalTo(1));
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
int docId = topDocs.scoreDocs[0].doc;
|
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
|
||||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
|
||||||
passageFormatter, null, value, false);
|
|
||||||
Snippet[] snippets = highlighter.highlightField("all", query, docId, 5);
|
|
||||||
assertThat(snippets.length, equalTo(1));
|
|
||||||
assertThat(snippets[0].getText(), equalTo("The quick brown <b>fox</b>."));
|
|
||||||
ir.close();
|
|
||||||
dir.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommonTermsQuery() throws IOException {
|
public void testCommonTermsQuery() throws Exception {
|
||||||
Directory dir = newDirectory();
|
final String[] inputs = {
|
||||||
String value = "The quick brown fox.";
|
"The quick brown fox."
|
||||||
Analyzer analyzer = new StandardAnalyzer();
|
};
|
||||||
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
|
final String[] outputs = {
|
||||||
|
"The <b>quick</b> <b>brown</b> <b>fox</b>."
|
||||||
|
};
|
||||||
CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
|
CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
|
||||||
query.add(new Term("text", "quick"));
|
query.add(new Term("text", "quick"));
|
||||||
query.add(new Term("text", "brown"));
|
query.add(new Term("text", "brown"));
|
||||||
query.add(new Term("text", "fox"));
|
query.add(new Term("text", "fox"));
|
||||||
IndexSearcher searcher = newSearcher(ir);
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
|
BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||||
assertThat(topDocs.totalHits, equalTo(1));
|
}
|
||||||
int docId = topDocs.scoreDocs[0].doc;
|
|
||||||
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
|
public void testSentenceBoundedBreakIterator() throws Exception {
|
||||||
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
|
final String[] inputs = {
|
||||||
passageFormatter, null, value, false);
|
"The quick brown fox in a long sentence with another quick brown fox. " +
|
||||||
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
|
"Another sentence with brown fox."
|
||||||
assertThat(snippets.length, equalTo(1));
|
};
|
||||||
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
|
final String[] outputs = {
|
||||||
ir.close();
|
"The <b>quick</b> <b>brown</b>",
|
||||||
dir.close();
|
"<b>fox</b> in a long",
|
||||||
|
"with another <b>quick</b>",
|
||||||
|
"<b>brown</b> <b>fox</b>.",
|
||||||
|
"sentence with <b>brown</b>",
|
||||||
|
"<b>fox</b>.",
|
||||||
|
};
|
||||||
|
BooleanQuery query = new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
|
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRepeat() throws Exception {
|
||||||
|
final String[] inputs = {
|
||||||
|
"Fun fun fun fun fun fun fun fun fun fun"
|
||||||
|
};
|
||||||
|
final String[] outputs = {
|
||||||
|
"<b>Fun</b> <b>fun</b> <b>fun</b>",
|
||||||
|
"<b>fun</b> <b>fun</b>",
|
||||||
|
"<b>fun</b> <b>fun</b> <b>fun</b>",
|
||||||
|
"<b>fun</b> <b>fun</b>"
|
||||||
|
};
|
||||||
|
Query query = new TermQuery(new Term("text", "fun"));
|
||||||
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
|
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
|
||||||
|
|
||||||
|
query = new PhraseQuery.Builder()
|
||||||
|
.add(new Term("text", "fun"))
|
||||||
|
.add(new Term("text", "fun"))
|
||||||
|
.build();
|
||||||
|
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
|
||||||
|
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -751,52 +751,69 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
|
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <em>quick</em> brown fox jumps over"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception {
|
public void testHighlighterWithSentenceBoundaryScanner() throws Exception {
|
||||||
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
||||||
ensureGreen();
|
ensureGreen();
|
||||||
|
|
||||||
indexRandom(true, client().prepareIndex("test", "type1")
|
indexRandom(true, client().prepareIndex("test", "type1")
|
||||||
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
|
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
|
||||||
|
|
||||||
|
for (String type : new String[] {"unified", "fvh"}) {
|
||||||
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
|
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
|
||||||
SearchSourceBuilder source = searchSource()
|
SearchSourceBuilder source = searchSource()
|
||||||
.query(termQuery("field1", "sentence"))
|
.query(termQuery("field1", "sentence"))
|
||||||
.highlighter(highlight()
|
.highlighter(highlight()
|
||||||
.field("field1", 20, 2)
|
.field("field1", 21, 2)
|
||||||
.order("score")
|
.highlighterType(type)
|
||||||
.preTags("<xxx>").postTags("</xxx>")
|
.preTags("<xxx>").postTags("</xxx>")
|
||||||
.boundaryScannerType(BoundaryScannerType.SENTENCE));
|
.boundaryScannerType(BoundaryScannerType.SENTENCE));
|
||||||
|
|
||||||
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
||||||
|
|
||||||
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
|
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
|
||||||
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
|
equalTo("A <xxx>sentence</xxx> with few words"),
|
||||||
|
equalTo("A <xxx>sentence</xxx> with few words. ")
|
||||||
|
));
|
||||||
|
|
||||||
|
assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
|
||||||
|
equalTo("Another <xxx>sentence</xxx> with"),
|
||||||
|
equalTo("Another <xxx>sentence</xxx> with even more words. ")
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
|
public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception {
|
||||||
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
||||||
ensureGreen();
|
ensureGreen();
|
||||||
|
|
||||||
indexRandom(true, client().prepareIndex("test", "type1")
|
indexRandom(true, client().prepareIndex("test", "type1")
|
||||||
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
|
.setSource("field1", "A sentence with few words. Another sentence with even more words."));
|
||||||
|
|
||||||
|
for (String type : new String[] {"fvh", "unified"}) {
|
||||||
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
|
logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner");
|
||||||
SearchSourceBuilder source = searchSource()
|
SearchSourceBuilder source = searchSource()
|
||||||
.query(termQuery("field1", "sentence"))
|
.query(termQuery("field1", "sentence"))
|
||||||
.highlighter(highlight()
|
.highlighter(highlight()
|
||||||
.field("field1", 20, 2)
|
.field("field1", 21, 2)
|
||||||
.order("score")
|
.highlighterType(type)
|
||||||
.preTags("<xxx>").postTags("</xxx>")
|
.preTags("<xxx>").postTags("</xxx>")
|
||||||
.boundaryScannerType(BoundaryScannerType.SENTENCE)
|
.boundaryScannerType(BoundaryScannerType.SENTENCE)
|
||||||
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
|
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
|
||||||
|
|
||||||
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
||||||
|
|
||||||
assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A <xxx>sentence</xxx> with few words. "));
|
assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf(
|
||||||
assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another <xxx>sentence</xxx> with even more words. "));
|
equalTo("A <xxx>sentence</xxx> with few words"),
|
||||||
|
equalTo("A <xxx>sentence</xxx> with few words. ")
|
||||||
|
));
|
||||||
|
|
||||||
|
assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf(
|
||||||
|
equalTo("Another <xxx>sentence</xxx> with"),
|
||||||
|
equalTo("Another <xxx>sentence</xxx> with even more words. ")
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception {
|
public void testHighlighterWithWordBoundaryScanner() throws Exception {
|
||||||
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
||||||
ensureGreen();
|
ensureGreen();
|
||||||
|
|
||||||
|
@ -804,39 +821,48 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
|
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
|
||||||
|
|
||||||
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
|
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
|
||||||
|
for (String type : new String[] {"unified", "fvh"}) {
|
||||||
SearchSourceBuilder source = searchSource()
|
SearchSourceBuilder source = searchSource()
|
||||||
.query(termQuery("field1", "some"))
|
.query(termQuery("field1", "some"))
|
||||||
.highlighter(highlight()
|
.highlighter(highlight()
|
||||||
.field("field1", 23, 1)
|
.field("field1", 23, 1)
|
||||||
.order("score")
|
.highlighterType(type)
|
||||||
.preTags("<xxx>").postTags("</xxx>")
|
.preTags("<xxx>").postTags("</xxx>")
|
||||||
.boundaryScannerType(BoundaryScannerType.WORD));
|
.boundaryScannerType(BoundaryScannerType.WORD));
|
||||||
|
|
||||||
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
||||||
|
|
||||||
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
|
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
|
||||||
|
equalTo("<xxx>some</xxx> quick and hairy brown"),
|
||||||
|
equalTo("<xxx>some</xxx>")
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
|
public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception {
|
||||||
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping()));
|
||||||
ensureGreen();
|
ensureGreen();
|
||||||
|
|
||||||
indexRandom(true, client().prepareIndex("test", "type1")
|
indexRandom(true, client().prepareIndex("test", "type1")
|
||||||
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
|
.setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog"));
|
||||||
|
|
||||||
logger.info("--> highlighting and searching on 'field' with word boundary_scanner");
|
for (String type : new String[] {"unified", "fvh"}) {
|
||||||
SearchSourceBuilder source = searchSource()
|
SearchSourceBuilder source = searchSource()
|
||||||
.query(termQuery("field1", "some"))
|
.query(termQuery("field1", "some"))
|
||||||
.highlighter(highlight()
|
.highlighter(highlight()
|
||||||
.field("field1", 23, 1)
|
.field("field1", 23, 1)
|
||||||
.order("score")
|
.highlighterType(type)
|
||||||
.preTags("<xxx>").postTags("</xxx>")
|
.preTags("<xxx>").postTags("</xxx>")
|
||||||
.boundaryScannerType(BoundaryScannerType.WORD)
|
.boundaryScannerType(BoundaryScannerType.WORD)
|
||||||
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
|
.boundaryScannerLocale(Locale.ENGLISH.toLanguageTag()));
|
||||||
|
|
||||||
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get();
|
||||||
|
|
||||||
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("<xxx>some</xxx> quick and hairy brown"));
|
assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf(
|
||||||
|
equalTo("<xxx>some</xxx> quick and hairy brown"),
|
||||||
|
equalTo("<xxx>some</xxx>")
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1841,16 +1867,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
|
// Unified hl also works but the fragment is longer than the plain highlighter because of the boundary is the word
|
||||||
|
field.highlighterType("unified");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
||||||
field.highlighterType("postings");
|
field.highlighterType("postings");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
||||||
|
|
||||||
// Unified hl also works but the fragment is the whole first sentence (size ignored)
|
|
||||||
field.highlighterType("unified");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
|
||||||
|
|
||||||
// We can also ask for a fragment longer than the input string and get the whole string
|
// We can also ask for a fragment longer than the input string and get the whole string
|
||||||
field.highlighterType("plain").noMatchSize(text.length() * 2);
|
field.highlighterType("plain").noMatchSize(text.length() * 2);
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
@ -1860,16 +1886,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
||||||
|
|
||||||
|
field.highlighterType("unified");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
||||||
|
|
||||||
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
|
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
|
||||||
field.highlighterType("postings");
|
field.highlighterType("postings");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
||||||
|
|
||||||
//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
|
|
||||||
field.highlighterType("unified");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
|
||||||
|
|
||||||
// We can also ask for a fragment exactly the size of the input field and get the whole field
|
// We can also ask for a fragment exactly the size of the input field and get the whole field
|
||||||
field.highlighterType("plain").noMatchSize(text.length());
|
field.highlighterType("plain").noMatchSize(text.length());
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
@ -1879,16 +1904,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
||||||
|
|
||||||
|
// unified hl returns the first sentence as the noMatchSize does not cross sentence boundary.
|
||||||
|
field.highlighterType("unified");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo(text));
|
||||||
|
|
||||||
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
|
//no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0)
|
||||||
field.highlighterType("postings");
|
field.highlighterType("postings");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
||||||
|
|
||||||
//no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0)
|
|
||||||
field.highlighterType("unified");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
|
||||||
|
|
||||||
// You can set noMatchSize globally in the highlighter as well
|
// You can set noMatchSize globally in the highlighter as well
|
||||||
field.highlighterType("plain").noMatchSize(null);
|
field.highlighterType("plain").noMatchSize(null);
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
||||||
|
@ -1898,12 +1923,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
field.highlighterType("postings");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
|
||||||
|
|
||||||
field.highlighterType("unified");
|
field.highlighterType("unified");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
|
field.highlighterType("postings");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
||||||
|
|
||||||
// We don't break if noMatchSize is less than zero though
|
// We don't break if noMatchSize is less than zero though
|
||||||
|
@ -1947,16 +1972,15 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
|
field.highlighterType("unified");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some"));
|
||||||
|
|
||||||
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
||||||
field.highlighterType("postings");
|
field.highlighterType("postings");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
||||||
|
|
||||||
// Unified hl also works but the fragment is the whole first sentence (size ignored)
|
|
||||||
field.highlighterType("unified");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off."));
|
|
||||||
|
|
||||||
// And noMatchSize returns nothing when the first entry is empty string!
|
// And noMatchSize returns nothing when the first entry is empty string!
|
||||||
index("test", "type1", "2", "text", new String[] {"", text2});
|
index("test", "type1", "2", "text", new String[] {"", text2});
|
||||||
refresh();
|
refresh();
|
||||||
|
@ -1980,11 +2004,12 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
.highlighter(new HighlightBuilder().field(field)).get();
|
.highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertNotHighlighted(response, 0, "text");
|
assertNotHighlighted(response, 0, "text");
|
||||||
|
|
||||||
|
// except for the unified highlighter which starts from the first string with actual content
|
||||||
field.highlighterType("unified");
|
field.highlighterType("unified");
|
||||||
response = client().prepareSearch("test")
|
response = client().prepareSearch("test")
|
||||||
.setQuery(idsQueryBuilder)
|
.setQuery(idsQueryBuilder)
|
||||||
.highlighter(new HighlightBuilder().field(field)).get();
|
.highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertNotHighlighted(response, 0, "text");
|
assertHighlight(response, 0, "text", 0, 1, equalTo("I am short"));
|
||||||
|
|
||||||
// But if the field was actually empty then you should get no highlighting field
|
// But if the field was actually empty then you should get no highlighting field
|
||||||
index("test", "type1", "3", "text", new String[] {});
|
index("test", "type1", "3", "text", new String[] {});
|
||||||
|
@ -2031,7 +2056,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
.highlighter(new HighlightBuilder().field(field)).get();
|
.highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertNotHighlighted(response, 0, "text");
|
assertNotHighlighted(response, 0, "text");
|
||||||
|
|
||||||
field.highlighterType("fvh");
|
field.highlighterType("unified");
|
||||||
response = client().prepareSearch("test")
|
response = client().prepareSearch("test")
|
||||||
.setQuery(idsQueryBuilder)
|
.setQuery(idsQueryBuilder)
|
||||||
.highlighter(new HighlightBuilder().field(field)).get();
|
.highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
@ -2081,16 +2106,16 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
|
||||||
|
|
||||||
|
field.highlighterType("unified");
|
||||||
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
|
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
|
||||||
|
|
||||||
|
|
||||||
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
// Postings hl also works but the fragment is the whole first sentence (size ignored)
|
||||||
field.highlighterType("postings");
|
field.highlighterType("postings");
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
|
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
|
||||||
|
|
||||||
// Unified hl also works but the fragment is the whole first sentence (size ignored)
|
|
||||||
field.highlighterType("unified");
|
|
||||||
response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
|
|
||||||
assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
|
|
||||||
|
|
||||||
//if there's a match we only return the values with matches (whole value as number_of_fragments == 0)
|
//if there's a match we only return the values with matches (whole value as number_of_fragments == 0)
|
||||||
MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth");
|
MatchQueryBuilder queryBuilder = QueryBuilders.matchQuery("text", "third fifth");
|
||||||
field.highlighterType("plain");
|
field.highlighterType("plain");
|
||||||
|
|
|
@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a
|
||||||
* `highlight_query`
|
* `highlight_query`
|
||||||
* `pre_tags and `post_tags`
|
* `pre_tags and `post_tags`
|
||||||
* `require_field_match`
|
* `require_field_match`
|
||||||
|
* `boundary_scanner` (`sentence` (**default**) or `word`)
|
||||||
|
* `max_fragment_length` (only for `sentence` scanner)
|
||||||
|
* `no_match_size`
|
||||||
|
|
||||||
==== Force highlighter type
|
==== Force highlighter type
|
||||||
|
|
||||||
|
@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from.
|
||||||
In the case where there is no matching fragment to highlight, the default is
|
In the case where there is no matching fragment to highlight, the default is
|
||||||
to not return anything. Instead, we can return a snippet of text from the
|
to not return anything. Instead, we can return a snippet of text from the
|
||||||
beginning of the field by setting `no_match_size` (default `0`) to the length
|
beginning of the field by setting `no_match_size` (default `0`) to the length
|
||||||
of the text that you want returned. The actual length may be shorter than
|
of the text that you want returned. The actual length may be shorter or longer than
|
||||||
specified as it tries to break on a word boundary. When using the postings
|
specified as it tries to break on a word boundary. When using the postings
|
||||||
highlighter it is not possible to control the actual size of the snippet,
|
highlighter it is not possible to control the actual size of the snippet,
|
||||||
therefore the first sentence gets returned whenever `no_match_size` is
|
therefore the first sentence gets returned whenever `no_match_size` is
|
||||||
|
@ -504,21 +507,26 @@ GET /_search
|
||||||
[[boundary-scanners]]
|
[[boundary-scanners]]
|
||||||
==== Boundary Scanners
|
==== Boundary Scanners
|
||||||
|
|
||||||
When highlighting a field using the fast vector highlighter, you can specify
|
When highlighting a field using the unified highlighter or the fast vector highlighter,
|
||||||
how to break the highlighted fragments using `boundary_scanner`, which accepts
|
you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts
|
||||||
the following values:
|
the following values:
|
||||||
|
|
||||||
* `chars` (default): allows to configure which characters (`boundary_chars`)
|
* `chars` (default mode for the FVH): allows to configure which characters (`boundary_chars`)
|
||||||
constitute a boundary for highlighting. It's a single string with each boundary
|
constitute a boundary for highlighting. It's a single string with each boundary
|
||||||
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
|
character defined in it (defaults to `.,!? \t\n`). It also allows configuring
|
||||||
the `boundary_max_scan` to control how far to look for boundary characters
|
the `boundary_max_scan` to control how far to look for boundary characters
|
||||||
(defaults to `20`).
|
(defaults to `20`). Works only with the Fast Vector Highlighter.
|
||||||
|
|
||||||
* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
|
* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
|
||||||
to break the highlighted fragments at the next _word_ or _sentence_ boundary.
|
to break the highlighted fragments at the next _sentence_ or _word_ boundary.
|
||||||
You can further specify `boundary_scanner_locale` to control which Locale is used
|
You can further specify `boundary_scanner_locale` to control which Locale is used
|
||||||
to search the text for these boundaries.
|
to search the text for these boundaries.
|
||||||
|
|
||||||
|
[NOTE]
|
||||||
|
When used with the `unified` highlighter, the `sentence` scanner splits sentences
|
||||||
|
bigger than `fragment_size` at the first word boundary after `fragment_size` is reached.
|
||||||
|
You can set `fragment_size` to 0 to never split any sentence.
|
||||||
|
|
||||||
[[matched-fields]]
|
[[matched-fields]]
|
||||||
==== Matched Fields
|
==== Matched Fields
|
||||||
The Fast Vector Highlighter can combine matches on multiple fields to
|
The Fast Vector Highlighter can combine matches on multiple fields to
|
||||||
|
|
Loading…
Reference in New Issue