Remove the postings highlighter and make unified the default highlighter choice (#25028)

This change removes the `postings` highlighter. This highlighter has been removed from Lucene master (7.x) because it behaves
exactly like the `unified` highlighter when `index_options` is set to `offsets`:
https://issues.apache.org/jira/browse/LUCENE-7815

It also makes the `unified` highlighter the default choice for highlighting a field (if `type` is not provided).
The strategy used internally by this highlighter remains the same as before: it checks `term_vectors` first, then `postings`, and ultimately re-analyzes the text.
Finally, it rewrites the docs so that the options that the `unified` highlighter cannot handle are clearly marked as such.
There are a few features that the `unified` highlighter is not yet able to handle, which is why the other highlighters (`plain` and `fvh`) are still available.
I'll open separate issues for these features, and we'll deprecate the `fvh` and `plain` highlighters once full support for them has been added to the `unified` highlighter.
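As a minimal sketch of the new default (the `comment` field below is illustrative), a highlight request that does not set a `type` now goes through the `unified` highlighter:

[source,js]
--------------------------------------------------
GET /_search
{
    "query" : {
        "match" : { "comment" : "highlighting" }
    },
    "highlight" : {
        "fields" : {
            "comment" : {}
        }
    }
}
--------------------------------------------------

Setting `"type": "plain"` or `"type": "fvh"` in the per-field options still forces one of the remaining highlighters.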
Jim Ferenczi 2017-06-09 14:09:57 +02:00 committed by GitHub
parent eca4f24b16
commit 8250aa4267
28 changed files with 640 additions and 1633 deletions


@@ -16,7 +16,6 @@
<!-- Hopefully temporary suppression of LineLength on files that don't pass it. We should remove these when the
files start to pass. -->
<suppress files="client[/\\]rest[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]client[/\\]HeapBufferedAsyncResponseConsumerTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]postingshighlight[/\\]CustomPostingsHighlighter.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]vectorhighlight[/\\]CustomFieldQuery.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]Action.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]ActionRequestBuilder.java" checks="LineLength" />
@@ -428,7 +427,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]threadpool[/\\]ThreadPool.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]tribe[/\\]TribeService.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]queries[/\\]BlendedTermQueryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]postingshighlight[/\\]CustomPostingsHighlighterTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]VersionTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]RejectionActionIT.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]admin[/\\]HotThreadsIT.java" checks="LineLength" />


@@ -1,82 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.Encoder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
/**
Custom passage formatter that allows us to:
1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet})
2) use the {@link Encoder} implementations that are already used with the other highlighters
*/
public class CustomPassageFormatter extends PassageFormatter {
private final String preTag;
private final String postTag;
private final Encoder encoder;
public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
this.preTag = preTag;
this.postTag = postTag;
this.encoder = encoder;
}
@Override
public Snippet[] format(Passage[] passages, String content) {
Snippet[] snippets = new Snippet[passages.length];
int pos;
for (int j = 0; j < passages.length; j++) {
Passage passage = passages[j];
StringBuilder sb = new StringBuilder();
pos = passage.getStartOffset();
for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.getMatchStarts()[i];
int end = passage.getMatchEnds()[i];
// it's possible to have overlapping terms
if (start > pos) {
append(sb, content, pos, start);
}
if (end > pos) {
sb.append(preTag);
append(sb, content, Math.max(pos, start), end);
sb.append(postTag);
pos = end;
}
}
// it's possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
//we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
sb.deleteCharAt(sb.length() - 1);
} else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
sb.deleteCharAt(sb.length() - 1);
}
//and we trim the snippets too
snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
}
return snippets;
}
protected void append(StringBuilder dest, String content, int start, int end) {
dest.append(encoder.encodeText(content.substring(start, end)));
}
}


@@ -1,138 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Snippet;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Map;
/**
* Subclass of the {@link PostingsHighlighter} that works for a single field in a single document.
* Uses a custom {@link PassageFormatter}. Accepts field content as a constructor argument, given that loading
* is custom and can be done reading from _source field. Supports using different {@link BreakIterator} to break
* the text into fragments. Considers every distinct field value as a discrete passage for highlighting (unless
* the whole content needs to be highlighted). Supports both returning empty snippets and non highlighted snippets
* when no highlighting can be performed.
*
* The use that we make of the postings highlighter is not optimal. It would be much better to highlight
* multiple docs in a single call, as we actually lose its sequential IO. That would require to
* refactor the elasticsearch highlight api which currently works per hit.
*/
public final class CustomPostingsHighlighter extends PostingsHighlighter {
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
private static final Passage[] EMPTY_PASSAGE = new Passage[0];
private final Analyzer analyzer;
private final CustomPassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
private final String fieldValue;
/**
* Creates a new instance of {@link CustomPostingsHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
* @param fieldValue the original field values as constructor argument, loaded from the _source field or the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
* no highlighting can be performed
*/
public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, String fieldValue, boolean returnNonHighlightedSnippets) {
this(analyzer, passageFormatter, null, fieldValue, returnNonHighlightedSnippets);
}
/**
* Creates a new instance of {@link CustomPostingsHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
* @param breakIterator an instance {@link BreakIterator} selected depending on the highlighting options
* @param fieldValue the original field values as constructor argument, loaded from the _source field or the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
* no highlighting can be performed
*/
public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, BreakIterator breakIterator, String fieldValue, boolean returnNonHighlightedSnippets) {
this.analyzer = analyzer;
this.passageFormatter = passageFormatter;
this.breakIterator = breakIterator;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
this.fieldValue = fieldValue;
}
/**
* Highlights terms extracted from the provided query within the content of the provided field name
*/
public Snippet[] highlightField(String field, Query query, IndexSearcher searcher, int docId, int maxPassages) throws IOException {
Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, searcher, new int[]{docId}, new int[]{maxPassages});
Object[] snippetObjects = fieldsAsObjects.get(field);
if (snippetObjects != null) {
//one single document at a time
assert snippetObjects.length == 1;
Object snippetObject = snippetObjects[0];
if (snippetObject != null && snippetObject instanceof Snippet[]) {
return (Snippet[]) snippetObject;
}
}
return EMPTY_SNIPPET;
}
@Override
protected PassageFormatter getFormatter(String field) {
return passageFormatter;
}
@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator == null) {
return super.getBreakIterator(field);
}
return breakIterator;
}
/*
By default the postings highlighter returns non highlighted snippet when there are no matches.
We want to return no snippets by default, unless no_match_size is greater than 0
*/
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
if (returnNonHighlightedSnippets) {
//we want to return the first sentence of the first snippet only
return super.getEmptyHighlight(fieldName, bi, 1);
}
return EMPTY_PASSAGE;
}
@Override
protected Analyzer getIndexAnalyzer(String field) {
return analyzer;
}
@Override
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
//we only highlight one field, one document at a time
return new String[][]{new String[]{fieldValue}};
}
}


@@ -20,7 +20,6 @@
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Snippet;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
/**


@@ -27,7 +27,6 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;


@@ -17,11 +17,11 @@
* under the License.
*/
package org.apache.lucene.search.highlight;
package org.apache.lucene.search.uhighlight;
/**
* Represents a scored highlighted snippet.
* It's our own arbitrary object that we get back from the postings highlighter when highlighting a document.
* It's our own arbitrary object that we get back from the unified highlighter when highlighting a document.
* Every snippet contains its formatted text and its score.
* The score is needed in case we want to sort snippets by score; they get sorted by position in the text by default.
*/


@@ -229,7 +229,6 @@ import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
import org.elasticsearch.search.rescore.QueryRescorerBuilder;
import org.elasticsearch.search.rescore.RescoreBuilder;
@@ -574,7 +573,6 @@ public class SearchModule {
NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
highlighters.register("fvh", new FastVectorHighlighter(settings));
highlighters.register("plain", new PlainHighlighter());
highlighters.register("postings", new PostingsHighlighter());
highlighters.register("unified", new UnifiedHighlighter());
highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);


@@ -262,8 +262,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
/**
* Set type of highlighter to use. Out of the box supported types
* are <tt>plain</tt>, <tt>fvh</tt> and <tt>postings</tt>.
* The default option selected is dependent on the mappings defined for your index.
* are <tt>unified</tt>, <tt>plain</tt> and <tt>fvh</tt>.
* Defaults to <tt>unified</tt>.
* Details of the different highlighter types are covered in the reference guide.
*/
@SuppressWarnings("unchecked")


@@ -50,7 +50,6 @@ import java.util.Locale;
import java.util.Map;
public class FastVectorHighlighter implements Highlighter {
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));


@@ -39,8 +39,6 @@ import java.util.List;
import java.util.Map;
public class HighlightPhase extends AbstractComponent implements FetchSubPhase {
private static final List<String> STANDARD_HIGHLIGHTERS_BY_PRECEDENCE = Arrays.asList("fvh", "postings", "plain");
private final Map<String, Highlighter> highlighters;
public HighlightPhase(Settings settings, Map<String, Highlighter> highlighters) {
@@ -94,13 +92,7 @@ public class HighlightPhase extends AbstractComponent implements FetchSubPhase {
}
String highlighterType = field.fieldOptions().highlighterType();
if (highlighterType == null) {
for(String highlighterCandidate : STANDARD_HIGHLIGHTERS_BY_PRECEDENCE) {
if (highlighters.get(highlighterCandidate).canHighlight(fieldMapper)) {
highlighterType = highlighterCandidate;
break;
}
}
assert highlighterType != null;
highlighterType = "unified";
}
Highlighter highlighter = highlighters.get(highlighterType);
if (highlighter == null) {


@@ -35,7 +35,7 @@ import static java.util.Collections.singleton;
public final class HighlightUtils {
//U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting (postings highlighter)
//U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting (unified highlighter)
public static final char PARAGRAPH_SEPARATOR = 8233;
public static final char NULL_SEPARATOR = '\u0000';


@@ -49,7 +49,6 @@ import java.util.List;
import java.util.Map;
public class PlainHighlighter implements Highlighter {
private static final String CACHE_KEY = "highlight-plain";
@Override


@@ -1,195 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.postingshighlight.CustomPassageFormatter;
import org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils.Encoders;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
public class PostingsHighlighter implements Highlighter {
private static final String CACHE_KEY = "highlight-postings";
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
FieldMapper fieldMapper = highlighterContext.mapper;
SearchContextHighlight.Field field = highlighterContext.field;
if (canHighlight(fieldMapper) == false) {
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
+ "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
}
SearchContext context = highlighterContext.context;
FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
}
HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
if (mapperHighlighterEntry == null) {
Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(
field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
}
List<Snippet> snippets = new ArrayList<>();
int numberOfFragments;
try {
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
CustomPostingsHighlighter highlighter;
if (field.fieldOptions().numberOfFragments() == 0) {
//we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
//so we don't lose the distinction between the different values of a field and we get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator,
fieldValue, field.fieldOptions().noMatchSize() > 0);
numberOfFragments = fieldValues.size(); //we are highlighting the whole content, one snippet per value
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter,
fieldValue, field.fieldOptions().noMatchSize() > 0);
numberOfFragments = field.fieldOptions().numberOfFragments();
}
IndexSearcher searcher = new IndexSearcher(hitContext.reader());
Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher,
hitContext.docId(), numberOfFragments);
for (Snippet fieldSnippet : fieldSnippets) {
if (Strings.hasText(fieldSnippet.getText())) {
snippets.add(fieldSnippet);
}
}
} catch(IOException e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
if (field.fieldOptions().scoreOrdered()) {
//let's sort the snippets by score if needed
CollectionUtil.introSort(snippets, new Comparator<Snippet>() {
@Override
public int compare(Snippet o1, Snippet o2) {
return (int) Math.signum(o2.getScore() - o1.getScore());
}
});
}
String[] fragments = new String[snippets.size()];
for (int i = 0; i < fragments.length; i++) {
fragments[i] = snippets.get(i).getText();
}
if (fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
return null;
}
@Override
public boolean canHighlight(FieldMapper fieldMapper) {
return fieldMapper.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
}
static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
//postings highlighter accepts all values in a single string, as offsets etc. need to match with content
//loaded from stored fields, we merge all values using a proper separator
String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
}
static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
//We need to filter the snippets as due to no_match_size we could have
//either highlighted snippets or non highlighted ones and we don't want to mix those up
List<Snippet> filteredSnippets = new ArrayList<>(snippets.size());
for (Snippet snippet : snippets) {
if (snippet.isHighlighted()) {
filteredSnippets.add(snippet);
}
}
//if there's at least one highlighted snippet, we return all the highlighted ones
//otherwise we return the first non highlighted one if available
if (filteredSnippets.size() == 0) {
if (snippets.size() > 0) {
Snippet snippet = snippets.get(0);
//if we tried highlighting the whole content using whole break iterator (as number_of_fragments was 0)
//we need to return the first sentence of the content rather than the whole content
if (numberOfFragments == 0) {
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
String text = snippet.getText();
bi.setText(text);
int next = bi.next();
if (next != BreakIterator.DONE) {
String newText = text.substring(0, next).trim();
snippet = new Snippet(newText, snippet.getScore(), snippet.isHighlighted());
}
}
filteredSnippets.add(snippet);
}
}
return filteredSnippets;
}
static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
}
static class MapperHighlighterEntry {
final CustomPassageFormatter passageFormatter;
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
this.passageFormatter = passageFormatter;
}
}
}


@@ -21,7 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
@@ -44,8 +44,6 @@ import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
public class UnifiedHighlighter implements Highlighter {
private static final String CACHE_KEY = "highlight-unified";
@@ -174,6 +172,49 @@ public class UnifiedHighlighter implements Highlighter {
}
}
private static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
//We need to filter the snippets as due to no_match_size we could have
//either highlighted snippets or non highlighted ones and we don't want to mix those up
List<Snippet> filteredSnippets = new ArrayList<>(snippets.size());
for (Snippet snippet : snippets) {
if (snippet.isHighlighted()) {
filteredSnippets.add(snippet);
}
}
//if there's at least one highlighted snippet, we return all the highlighted ones
//otherwise we return the first non highlighted one if available
if (filteredSnippets.size() == 0) {
if (snippets.size() > 0) {
Snippet snippet = snippets.get(0);
//if we tried highlighting the whole content using whole break iterator (as number_of_fragments was 0)
//we need to return the first sentence of the content rather than the whole content
if (numberOfFragments == 0) {
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
String text = snippet.getText();
bi.setText(text);
int next = bi.next();
if (next != BreakIterator.DONE) {
String newText = text.substring(0, next).trim();
snippet = new Snippet(newText, snippet.getScore(), snippet.isHighlighted());
}
}
filteredSnippets.add(snippet);
}
}
return filteredSnippets;
}
private static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
//postings highlighter accepts all values in a single string, as offsets etc. need to match with content
//loaded from stored fields, we merge all values using a proper separator
String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
}
private static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
}


@@ -1,105 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.test.ESTestCase;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.notNullValue;
public class CustomPassageFormatterTests extends ESTestCase {
public void testSimpleFormat() {
String content = "This is a really cool highlighter. Postings highlighter gives nice snippets back. No matches here.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
Passage[] passages = new Passage[3];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.startOffset = 0;
passage1.endOffset = end + 2; //let's include the whitespace at the end to make sure we trim it
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.startOffset = passage1.endOffset;
passage2.endOffset = end + 26;
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Passage passage3 = new Passage();
passage3.startOffset = passage2.endOffset;
passage3.endOffset = content.length();
passages[2] = passage3;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(3));
assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
assertThat(fragments[0].isHighlighted(), equalTo(true));
assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
assertThat(fragments[1].isHighlighted(), equalTo(true));
assertThat(fragments[2].getText(), equalTo("No matches here."));
assertThat(fragments[2].isHighlighted(), equalTo(false));
}
public void testHtmlEncodeFormat() {
String content = "<b>This is a really cool highlighter.</b> Postings highlighter gives nice snippets back.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
Passage[] passages = new Passage[2];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.startOffset = 0;
passage1.endOffset = end + 6; //let's include the whitespace at the end to make sure we trim it
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.startOffset = passage1.endOffset;
passage2.endOffset = content.length();
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(2));
assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
}
}


@@ -1,157 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.store.Directory;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
import org.elasticsearch.test.ESTestCase;
import static org.hamcrest.CoreMatchers.equalTo;
public class CustomPostingsHighlighterTests extends ESTestCase {
public void testCustomPostingsHighlighter() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
//good position but only one match
final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue(firstValue);
//two matches, not the best snippet due to its length though
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
Field body2 = new Field("body", "", offsetsType);
doc.add(body2);
body2.setStringValue(secondValue);
//two matches and short, will be scored highest
final String thirdValue = "This is highlighting the third short highlighting value.";
Field body3 = new Field("body", "", offsetsType);
doc.add(body3);
body3.setStringValue(thirdValue);
//one match, same as first but at the end, will be scored lower due to its position
final String fourthValue = "Just a test4 highlighting from postings highlighter.";
Field body4 = new Field("body", "", offsetsType);
doc.add(body4);
body4.setStringValue(fourthValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
IndexSearcher searcher = newSearcher(ir);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
assertThat(snippets.length, equalTo(4));
assertThat(snippets[0].getText(), equalTo(firstHlValue));
assertThat(snippets[1].getText(), equalTo(secondHlValue));
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
ir.close();
dir.close();
}
public void testNoMatchSize() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Field none = new Field("none", "", offsetsType);
Document doc = new Document();
doc.add(body);
doc.add(none);
String firstValue = "This is a test. Just a test highlighting from postings. Feel free to ignore.";
body.setStringValue(firstValue);
none.setStringValue(firstValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
Query query = new TermQuery(new Term("none", "highlighting"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, passageFormatter, firstValue, false);
Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
assertThat(snippets.length, equalTo(0));
highlighter = new CustomPostingsHighlighter(null, passageFormatter, firstValue, true);
snippets = highlighter.highlightField("body", query, searcher, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("This is a test."));
ir.close();
dir.close();
}
}


@@ -1,178 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
import org.elasticsearch.test.ESTestCase;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
import static org.hamcrest.CoreMatchers.equalTo;
public class CustomSeparatorBreakIteratorTests extends ESTestCase {
public void testBreakOnCustomSeparator() throws Exception {
Character separator = randomSeparator();
BreakIterator bi = new CustomSeparatorBreakIterator(separator);
String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
bi.setText(source);
assertThat(bi.current(), equalTo(0));
assertThat(bi.first(), equalTo(0));
assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
assertThat(bi.next(), equalTo(BreakIterator.DONE));
assertThat(bi.last(), equalTo(source.length()));
int current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
current = bi.current();
assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
assertThat(bi.previous(), equalTo(BreakIterator.DONE));
assertThat(bi.current(), equalTo(0));
assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
assertThat(bi.first(), equalTo(0));
assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
}
public void testSingleSentences() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("a", expected, actual);
assertSameBreaks("ab", expected, actual);
assertSameBreaks("abc", expected, actual);
assertSameBreaks("", expected, actual);
}
public void testSliceEnd() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("a000", 0, 1, expected, actual);
assertSameBreaks("ab000", 0, 1, expected, actual);
assertSameBreaks("abc000", 0, 1, expected, actual);
assertSameBreaks("000", 0, 0, expected, actual);
}
public void testSliceStart() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000a", 3, 1, expected, actual);
assertSameBreaks("000ab", 3, 2, expected, actual);
assertSameBreaks("000abc", 3, 3, expected, actual);
assertSameBreaks("000", 3, 0, expected, actual);
}
public void testSliceMiddle() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000a000", 3, 1, expected, actual);
assertSameBreaks("000ab000", 3, 2, expected, actual);
assertSameBreaks("000abc000", 3, 3, expected, actual);
assertSameBreaks("000000", 3, 0, expected, actual);
}
/** the current position must be ignored, initial position is always first() */
public void testFirstPosition() throws Exception {
BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
}
private static char randomSeparator() {
return randomFrom(' ', HighlightUtils.NULL_SEPARATOR, HighlightUtils.PARAGRAPH_SEPARATOR);
}
private static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text),
new StringCharacterIterator(text),
expected,
actual);
}
private static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(text, offset, length, offset, expected, actual);
}
private static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text, offset, offset + length, current),
new StringCharacterIterator(text, offset, offset + length, current),
expected,
actual);
}
/** Asserts that two breakiterators break the text the same way */
private static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
expected.setText(one);
actual.setText(two);
assertEquals(expected.current(), actual.current());
// next()
int v = expected.current();
while (v != BreakIterator.DONE) {
assertEquals(v = expected.next(), actual.next());
assertEquals(expected.current(), actual.current());
}
// first()
assertEquals(expected.first(), actual.first());
assertEquals(expected.current(), actual.current());
// last()
assertEquals(expected.last(), actual.last());
assertEquals(expected.current(), actual.current());
// previous()
v = expected.current();
while (v != BreakIterator.DONE) {
assertEquals(v = expected.previous(), actual.previous());
assertEquals(expected.current(), actual.current());
}
// following()
for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
expected.first();
actual.first();
assertEquals(expected.following(i), actual.following(i));
assertEquals(expected.current(), actual.current());
}
// preceding()
for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
expected.last();
actual.last();
assertEquals(expected.preceding(i), actual.preceding(i));
assertEquals(expected.current(), actual.current());
}
}
}


@@ -19,7 +19,6 @@
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.util.BytesRef;


@@ -41,7 +41,6 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.lucene.all.AllTermQuery;


@@ -56,7 +56,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.CustomHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.suggest.CustomSuggesterSearchIT.CustomSuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionBuilder;
@@ -204,7 +204,7 @@ public class SearchModuleTests extends ModuleTestCase {
Map<String, Highlighter> highlighters = module.getHighlighters();
assertEquals(FastVectorHighlighter.class, highlighters.get("fvh").getClass());
assertEquals(PlainHighlighter.class, highlighters.get("plain").getClass());
assertEquals(PostingsHighlighter.class, highlighters.get("postings").getClass());
assertEquals(UnifiedHighlighter.class, highlighters.get("unified").getClass());
assertSame(highlighters.get("custom"), customHighlighter);
}


@@ -852,7 +852,7 @@ public class TopHitsIT extends ESIntegTestCase {
}
public void testNestedFetchFeatures() {
String hlType = randomFrom("plain", "fvh", "postings");
String hlType = randomFrom("plain", "fvh", "unified");
HighlightBuilder.Field hlField = new HighlightBuilder.Field("comments.message")
.highlightQuery(matchQuery("comments.message", "comment"))
.forceSource(randomBoolean()) // randomly from stored field or _source


@@ -312,8 +312,7 @@ disk space and, because it is a combination of other fields, it may result in
odd highlighting results.
The `_all` field also accepts the `term_vector` and `index_options`
parameters, allowing the use of the fast vector highlighter and the postings
highlighter.
parameters, allowing highlighting to use it.
[[all-highlight-fields]]
===== Highlight original fields


@@ -26,7 +26,7 @@ following settings:
Doc number, term frequencies, positions, and start and end character
offsets (which map the term back to the original string) are indexed.
Offsets are used by the <<postings-highlighter,postings highlighter>>.
Offsets are used by the <<unified-highlighter,unified highlighter>> to speed up highlighting.
<<mapping-index,Analyzed>> string fields use `positions` as the default, and
all other fields use `docs` as the default.
@@ -67,4 +67,4 @@ GET my_index/_search
}
--------------------------------------------------
// CONSOLE
<1> The `text` field will use the postings highlighter by default because `offsets` are indexed.
<1> The `text` field will use the postings for highlighting by default because `offsets` are indexed.


@@ -188,7 +188,7 @@ accessed within the scope of the `nested` query, the
For instance, if a string field within a nested document has
<<index-options,`index_options`>> set to `offsets` to allow use of the postings
highlighter, these offsets will not be available during the main highlighting
during the highlighting, these offsets will not be available during the main highlighting
phase. Instead, highlighting needs to be performed via
<<nested-inner-hits,nested inner hits>>.


@@ -98,3 +98,14 @@ but the only reason why it has not been deprecated too is because it is used
for the `random_score` function. If you really need access to the id of
documents for sorting, aggregations or search scripts, the recommendation is
to duplicate the id as a field in the document.
==== Highlighters
The `unified` highlighter is the new default highlighter.
The offset strategy for each field is picked internally by this highlighter depending on the
type of the field (`index_options`).
It is still possible to force the highlighter type to `fvh` or `plain`.
The `postings` highlighter has been removed from Lucene and Elasticsearch.
The `unified` highlighter outputs the same highlighting when `index_options` is set
to `offsets`.
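
To keep the output of one of the remaining highlighters on a specific field, the `type` can still be forced in the request; a minimal sketch (the `comment` field is illustrative):

[source,js]
--------------------------------------------------
GET /_search
{
    "query" : {
        "match" : { "comment" : "test" }
    },
    "highlight" : {
        "fields" : {
            "comment" : { "type" : "plain" }
        }
    }
}
--------------------------------------------------
// CONSOLE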


@@ -1,9 +1,8 @@
[[search-request-highlighting]]
=== Highlighting
Allows to highlight search results on one or more fields. The
implementation uses either the lucene `plain` highlighter, the
fast vector highlighter (`fvh`) or `postings` highlighter.
Highlighters allow you to produce highlighted snippets from one or more fields
in your search results.
The following is an example of the search request body:
[source,js]
@@ -45,35 +44,48 @@ from versions before 5.0) that match the expression to be highlighted.
Note that all other fields will not be highlighted. If you use a custom mapper and want to
highlight on a field anyway, you have to provide the field name explicitly.
[[plain-highlighter]]
==== Plain highlighter
[[unified-highlighter]]
==== Unified Highlighter
The default choice of highlighter is of type `plain` and uses the Lucene highlighter.
It tries hard to reflect the query matching logic in terms of understanding word importance and any word positioning criteria in phrase queries.
The unified highlighter (which is used by default if no highlighter type is specified)
uses the Lucene Unified Highlighter.
This highlighter breaks the text into sentences and scores individual sentences as
if they were documents in the corpus, using the BM25 algorithm.
It also supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting.
[WARNING]
If you want to highlight a lot of fields in a lot of documents with complex queries this highlighter will not be fast.
In its efforts to accurately reflect query logic it creates a tiny in-memory index and re-runs the original query criteria through
[float]
===== Offsets Strategy
In order to create meaningful search snippets from the terms being queried,
a highlighter needs to know the start and end character offsets of each word
in the original text.
These offsets can be obtained from the following sources (a mapping sketch follows the list):
* The postings list (fields mapped as "index_options": "offsets").
* Term vectors (fields mapped as "term_vector": "with_positions_offsets").
* The original field, by reanalysing the text on-the-fly.
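
The first two sources are enabled in the field mapping. For instance, a minimal sketch of a mapping that stores offsets in the postings list (the `comment` field is illustrative; complete examples for each option follow below):

[source,js]
--------------------------------------------------
PUT /example
{
    "mappings": {
        "doc" : {
            "properties": {
                "comment" : {
                    "type": "text",
                    "index_options" : "offsets"
                }
            }
        }
    }
}
--------------------------------------------------
// CONSOLE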
[float]
====== Plain highlighting
This mode is picked when there is no other alternative.
It creates a tiny in-memory index and re-runs the original query criteria through
Lucene's query execution planner to get access to low-level match information on the current document.
This is repeated for every field and every document that needs highlighting. If this presents a performance issue in your system consider using an alternative highlighter.
This is repeated for every field and every document that needs highlighting.
[[postings-highlighter]]
==== Postings highlighter
[float]
====== Postings
If `index_options` is set to `offsets` in the mapping the postings highlighter
will be used instead of the plain highlighter. The postings highlighter:
* Is faster since it doesn't require to reanalyze the text to be highlighted:
the larger the documents the better the performance gain should be
* Requires less disk space than term_vectors, needed for the fast vector
highlighter
* Breaks the text into sentences and highlights them. Plays really well with
natural languages, not as well with fields containing for instance html markup
* Treats the document as the whole corpus, and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm
If `index_options` is set to `offsets` in the mapping, the `unified` highlighter
will use this information to highlight documents without re-analyzing the text.
It re-runs the original query directly on the postings and extracts the matching offsets
from the index, limiting the collection to the highlighted documents.
This mode is faster on large fields since it doesn't need to reanalyze the text to be highlighted,
and it requires less disk space than `term_vectors`, which the fast vector highlighter needs.
Here is an example of setting the `comment` field in the index mapping to allow for
highlighting using the postings highlighter on it:
highlighting using the postings:
[source,js]
--------------------------------------------------
@@ -93,24 +105,56 @@ PUT /example
--------------------------------------------------
// CONSOLE
[NOTE]
Note that the postings highlighter is meant to perform simple query terms
highlighting, regardless of their positions. That means that when used for
instance in combination with a phrase query, it will highlight all the terms
that the query is composed of, regardless of whether they are actually part of
a query match, effectively ignoring their positions.
[float]
====== Term Vectors
If `term_vector` information is provided by setting `term_vector` to
`with_positions_offsets` in the mapping, then the `unified` highlighter
will automatically use the `term_vector` to highlight the field.
Highlighting with `term_vector`s is faster for multi-term queries like
`prefix` or `wildcard` because it can access the dictionary of terms for each document,
but it is also usually more costly than using the `postings` directly.
Here is an example of setting the `comment` field to allow for
highlighting using the `term_vectors` (this will cause the index to be bigger):
[source,js]
--------------------------------------------------
PUT /example
{
"mappings": {
"doc" : {
"properties": {
"comment" : {
"type": "text",
"term_vector" : "with_positions_offsets"
}
}
}
}
}
--------------------------------------------------
// CONSOLE
[[plain-highlighter]]
==== Plain highlighter
This highlighter of type `plain` uses the standard Lucene highlighter.
It tries hard to reflect the query matching logic in terms of understanding word importance and any word positioning criteria in phrase queries.
[WARNING]
The postings highlighter doesn't support highlighting some complex queries,
like a `match` query with `type` set to `match_phrase_prefix`. No highlighted
snippets will be returned in that case.
If you want to highlight a lot of fields in a lot of documents with complex queries this highlighter will not be fast.
In its efforts to accurately reflect query logic it creates a tiny in-memory index and re-runs the original query criteria through
Lucene's query execution planner to get access to low-level match information on the current document.
This is repeated for every field and every document that needs highlighting. If this presents a performance issue in your system consider using an alternative highlighter.
[[fast-vector-highlighter]]
==== Fast vector highlighter
If `term_vector` information is provided by setting `term_vector` to
`with_positions_offsets` in the mapping then the fast vector highlighter
will be used instead of the plain highlighter. The fast vector highlighter:
This highlighter of type `fvh` uses the Lucene Fast Vector highlighter.
This highlighter can be used on fields with `term_vector` set to
`with_positions_offsets` in the mapping.
The fast vector highlighter:
* Is faster especially for large fields (> `1MB`)
* Can be customized with `boundary_scanner` (see <<boundary-scanners,below>>)
@@ -144,30 +188,10 @@ PUT /example
--------------------------------------------------
// CONSOLE
==== Unified Highlighter
experimental[]
The `unified` highlighter can extract offsets from either postings, term vectors, or via re-analyzing text.
Under the hood it uses Lucene UnifiedHighlighter which picks its strategy depending on the field and the query to highlight.
Independently of the strategy this highlighter breaks the text into sentences and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm.
It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting and can be used with the following options:
* `force_source`
* `encoder`
* `highlight_query`
* `pre_tags` and `post_tags`
* `require_field_match`
* `boundary_scanner` (`sentence` (**default**) or `word`)
* `max_fragment_length` (only for `sentence` scanner)
* `no_match_size`
==== Force highlighter type
The `type` field allows to force a specific highlighter type. This is useful
for instance when needing to use the plain highlighter on a field that has
`term_vectors` enabled. The allowed values are: `plain`, `postings` and `fvh`.
The `type` field allows to force a specific highlighter type.
The allowed values are: `unified`, `plain` and `fvh`.
The following is an example that forces the use of the plain highlighter:
[source,js]
@@ -320,9 +344,6 @@ GET /_search
// CONSOLE
// TEST[setup:twitter]
The `fragment_size` is ignored when using the postings highlighter, as it
outputs sentences regardless of their length.
On top of this it is possible to specify that highlighted fragments need
to be sorted by score:
@@ -375,10 +396,7 @@ In the case where there is no matching fragment to highlight, the default is
to not return anything. Instead, we can return a snippet of text from the
beginning of the field by setting `no_match_size` (default `0`) to the length
of the text that you want returned. The actual length may be shorter or longer than
specified as it tries to break on a word boundary. When using the postings
highlighter it is not possible to control the actual size of the snippet,
therefore the first sentence gets returned whenever `no_match_size` is
greater than `0`.
specified as it tries to break on a word boundary.
[source,js]
--------------------------------------------------
@@ -403,6 +421,8 @@ GET /_search
==== Fragmenter
WARNING: This option is not supported by the `unified` highlighter
Fragmenter can control how text should be broken up in highlight snippets.
However, this option is applicable only for the Plain Highlighter.
There are two options:
@@ -421,6 +441,7 @@ GET twitter/tweet/_search
"highlight" : {
"fields" : {
"message" : {
"type": "plain",
"fragment_size" : 15,
"number_of_fragments" : 3,
"fragmenter": "simple"
@@ -476,6 +497,7 @@ GET twitter/tweet/_search
"highlight" : {
"fields" : {
"message" : {
"type": "plain",
"fragment_size" : 15,
"number_of_fragments" : 3,
"fragmenter": "span"
@@ -596,12 +618,6 @@ GET /_search
// CONSOLE
// TEST[setup:twitter]
Note that the score of text fragment in this case is calculated by the Lucene
highlighting framework. For implementation details you can check the
`ScoreOrderFragmentsBuilder.java` class. On the other hand when using the
postings highlighter the fragments are scored using, as mentioned above,
the BM25 algorithm.
[[highlighting-settings]]
==== Global Settings
@@ -681,6 +697,9 @@ You can set `fragment_size` to 0 to never split any sentence.
[[matched-fields]]
==== Matched Fields
WARNING: This is only supported by the `fvh` highlighter
The Fast Vector Highlighter can combine matches on multiple fields to
highlight a single field using `matched_fields`. This is most
intuitive for multifields that analyze the same string in different
@@ -814,6 +833,9 @@ to
[[phrase-limit]]
==== Phrase Limit
WARNING: This is only supported by the `fvh` highlighter
The fast vector highlighter has a `phrase_limit` parameter that prevents
it from analyzing too many phrases and eating tons of memory. It defaults
to 256 so only the first 256 matching phrases in the document scored


@@ -136,7 +136,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
refresh();
SearchResponse search = client().prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
.highlighter(new HighlightBuilder().field("body")).get();
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).get();
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
search = client()
.prepareSearch()
@@ -146,7 +146,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
+ "is a test for highlighting feature"))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).execute().actionGet();
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));