Remove the postings highlighter and make unified the default highlighter choice (#25028)
This change removes the `postings` highlighter. This highlighter has been removed from Lucene master (7.x) because it behaves exactly like the `unified` highlighter when `index_options` is set to `offsets`: https://issues.apache.org/jira/browse/LUCENE-7815

This change also makes the `unified` highlighter the default choice for highlighting a field (if `type` is not provided). The strategy used internally by this highlighter remains the same as before: it checks `term_vectors` first, then `postings`, and ultimately it re-analyzes the text. Finally, it rewrites the docs so that the options that the `unified` highlighter cannot handle are clearly marked as such.

There are a few features that the `unified` highlighter is not able to handle, which is why the other highlighters (`plain` and `fvh`) are still available. I'll open separate issues for these features, and we'll deprecate the `fvh` and `plain` highlighters when full support for these features has been added to the `unified` highlighter.
parent eca4f24b16
commit 8250aa4267
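To make the fallback order described in the commit message concrete, here is a minimal sketch of the per-field offset-source choice. The `OffsetSource` name follows Lucene's unified highlighter, but this snippet is an illustration written for this summary, not the upstream code:

    // Hedged sketch of the lookup order described above: term vectors first,
    // then offsets stored in the postings, then re-analysis as a last resort.
    enum OffsetSource { TERM_VECTORS, POSTINGS, ANALYSIS }

    final class OffsetSourcePicker {
        // The two booleans stand in for the checks the unified highlighter
        // performs against the field's index metadata.
        static OffsetSource pick(boolean hasTermVectorOffsets, boolean hasPostingsOffsets) {
            if (hasTermVectorOffsets) {
                return OffsetSource.TERM_VECTORS; // offsets already materialized at index time
            }
            if (hasPostingsOffsets) {
                return OffsetSource.POSTINGS;     // index_options was set to "offsets"
            }
            return OffsetSource.ANALYSIS;         // re-analyze the stored field value
        }
    }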
@@ -16,7 +16,6 @@
 <!-- Hopefully temporary suppression of LineLength on files that don't pass it. We should remove these when we the
      files start to pass. -->
 <suppress files="client[/\\]rest[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]client[/\\]HeapBufferedAsyncResponseConsumerTests.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]postingshighlight[/\\]CustomPostingsHighlighter.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]vectorhighlight[/\\]CustomFieldQuery.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]Action.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]ActionRequestBuilder.java" checks="LineLength" />
@@ -428,7 +427,6 @@
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]threadpool[/\\]ThreadPool.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]tribe[/\\]TribeService.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]queries[/\\]BlendedTermQueryTests.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]apache[/\\]lucene[/\\]search[/\\]postingshighlight[/\\]CustomPostingsHighlighterTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]VersionTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]RejectionActionIT.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]action[/\\]admin[/\\]HotThreadsIT.java" checks="LineLength" />
@@ -1,82 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.search.postingshighlight;
-
-import org.apache.lucene.search.highlight.Snippet;
-import org.apache.lucene.search.highlight.Encoder;
-import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
-
-/**
-Custom passage formatter that allows us to:
-1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet})
-2) use the {@link Encoder} implementations that are already used with the other highlighters
-*/
-public class CustomPassageFormatter extends PassageFormatter {
-
-    private final String preTag;
-    private final String postTag;
-    private final Encoder encoder;
-
-    public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
-        this.preTag = preTag;
-        this.postTag = postTag;
-        this.encoder = encoder;
-    }
-
-    @Override
-    public Snippet[] format(Passage[] passages, String content) {
-        Snippet[] snippets = new Snippet[passages.length];
-        int pos;
-        for (int j = 0; j < passages.length; j++) {
-            Passage passage = passages[j];
-            StringBuilder sb = new StringBuilder();
-            pos = passage.getStartOffset();
-            for (int i = 0; i < passage.getNumMatches(); i++) {
-                int start = passage.getMatchStarts()[i];
-                int end = passage.getMatchEnds()[i];
-                // its possible to have overlapping terms
-                if (start > pos) {
-                    append(sb, content, pos, start);
-                }
-                if (end > pos) {
-                    sb.append(preTag);
-                    append(sb, content, Math.max(pos, start), end);
-                    sb.append(postTag);
-                    pos = end;
-                }
-            }
-            // its possible a "term" from the analyzer could span a sentence boundary.
-            append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
-            //we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
-            if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
-                sb.deleteCharAt(sb.length() - 1);
-            } else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
-                sb.deleteCharAt(sb.length() - 1);
-            }
-            //and we trim the snippets too
-            snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
-        }
-        return snippets;
-    }
-
-    protected void append(StringBuilder dest, String content, int start, int end) {
-        dest.append(encoder.encodeText(content.substring(start, end)));
-    }
-}
@@ -1,138 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.search.postingshighlight;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.highlight.Snippet;
-
-import java.io.IOException;
-import java.text.BreakIterator;
-import java.util.Map;
-
-/**
- * Subclass of the {@link PostingsHighlighter} that works for a single field in a single document.
- * Uses a custom {@link PassageFormatter}. Accepts field content as a constructor argument, given that loading
- * is custom and can be done reading from _source field. Supports using different {@link BreakIterator} to break
- * the text into fragments. Considers every distinct field value as a discrete passage for highlighting (unless
- * the whole content needs to be highlighted). Supports both returning empty snippets and non highlighted snippets
- * when no highlighting can be performed.
- *
- * The use that we make of the postings highlighter is not optimal. It would be much better to highlight
- * multiple docs in a single call, as we actually lose its sequential IO. That would require to
- * refactor the elasticsearch highlight api which currently works per hit.
- */
-public final class CustomPostingsHighlighter extends PostingsHighlighter {
-
-    private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
-    private static final Passage[] EMPTY_PASSAGE = new Passage[0];
-
-    private final Analyzer analyzer;
-    private final CustomPassageFormatter passageFormatter;
-    private final BreakIterator breakIterator;
-    private final boolean returnNonHighlightedSnippets;
-    private final String fieldValue;
-
-    /**
-     * Creates a new instance of {@link CustomPostingsHighlighter}
-     *
-     * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
-     * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
-     * @param fieldValue the original field values as constructor argument, loaded from te _source field or the relevant stored field.
-     * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
-     *                                     no highlighting can be performed
-     */
-    public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, String fieldValue, boolean returnNonHighlightedSnippets) {
-        this(analyzer, passageFormatter, null, fieldValue, returnNonHighlightedSnippets);
-    }
-
-    /**
-     * Creates a new instance of {@link CustomPostingsHighlighter}
-     *
-     * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
-     * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
-     * @param breakIterator an instance {@link BreakIterator} selected depending on the highlighting options
-     * @param fieldValue the original field values as constructor argument, loaded from te _source field or the relevant stored field.
-     * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
-     *                                     no highlighting can be performed
-     */
-    public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, BreakIterator breakIterator, String fieldValue, boolean returnNonHighlightedSnippets) {
-        this.analyzer = analyzer;
-        this.passageFormatter = passageFormatter;
-        this.breakIterator = breakIterator;
-        this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
-        this.fieldValue = fieldValue;
-    }
-
-    /**
-     * Highlights terms extracted from the provided query within the content of the provided field name
-     */
-    public Snippet[] highlightField(String field, Query query, IndexSearcher searcher, int docId, int maxPassages) throws IOException {
-        Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, searcher, new int[]{docId}, new int[]{maxPassages});
-        Object[] snippetObjects = fieldsAsObjects.get(field);
-        if (snippetObjects != null) {
-            //one single document at a time
-            assert snippetObjects.length == 1;
-            Object snippetObject = snippetObjects[0];
-            if (snippetObject != null && snippetObject instanceof Snippet[]) {
-                return (Snippet[]) snippetObject;
-            }
-        }
-        return EMPTY_SNIPPET;
-    }
-
-    @Override
-    protected PassageFormatter getFormatter(String field) {
-        return passageFormatter;
-    }
-
-    @Override
-    protected BreakIterator getBreakIterator(String field) {
-        if (breakIterator == null) {
-            return super.getBreakIterator(field);
-        }
-        return breakIterator;
-    }
-
-    /*
-    By default the postings highlighter returns non highlighted snippet when there are no matches.
-    We want to return no snippets by default, unless no_match_size is greater than 0
-     */
-    @Override
-    protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
-        if (returnNonHighlightedSnippets) {
-            //we want to return the first sentence of the first snippet only
-            return super.getEmptyHighlight(fieldName, bi, 1);
-        }
-        return EMPTY_PASSAGE;
-    }
-
-    @Override
-    protected Analyzer getIndexAnalyzer(String field) {
-        return analyzer;
-    }
-
-    @Override
-    protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
-        //we only highlight one field, one document at a time
-        return new String[][]{new String[]{fieldValue}};
-    }
-}
@@ -20,7 +20,6 @@
 package org.apache.lucene.search.uhighlight;
 
 import org.apache.lucene.search.highlight.Encoder;
-import org.apache.lucene.search.highlight.Snippet;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
 
 /**
@@ -27,7 +27,6 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.highlight.Snippet;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
@@ -17,11 +17,11 @@
  * under the License.
  */
 
-package org.apache.lucene.search.highlight;
+package org.apache.lucene.search.uhighlight;
 
 /**
  * Represents a scored highlighted snippet.
- * It's our own arbitrary object that we get back from the postings highlighter when highlighting a document.
+ * It's our own arbitrary object that we get back from the unified highlighter when highlighting a document.
  * Every snippet contains its formatted text and its score.
  * The score is needed in case we want to sort snippets by score, they get sorted by position in the text by default.
  */
@@ -229,7 +229,6 @@ import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase;
 import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
-import org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
 import org.elasticsearch.search.rescore.QueryRescorerBuilder;
 import org.elasticsearch.search.rescore.RescoreBuilder;
@@ -574,7 +573,6 @@ public class SearchModule {
         NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
         highlighters.register("fvh", new FastVectorHighlighter(settings));
         highlighters.register("plain", new PlainHighlighter());
-        highlighters.register("postings", new PostingsHighlighter());
         highlighters.register("unified", new UnifiedHighlighter());
         highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
 
@@ -262,8 +262,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
 
     /**
      * Set type of highlighter to use. Out of the box supported types
-     * are <tt>plain</tt>, <tt>fvh</tt> and <tt>postings</tt>.
-     * The default option selected is dependent on the mappings defined for your index.
+     * are <tt>unified</tt>, <tt>plain</tt> and <tt>fvh</tt>.
+     * Defaults to <tt>unified</tt>.
      * Details of the different highlighter types are covered in the reference guide.
      */
     @SuppressWarnings("unchecked")
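As a usage illustration of the builder method documented above (a sketch against the existing `HighlightBuilder` API; the field names `title` and `body` are made-up examples):

    import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;

    class HighlightTypeExample {
        // One field relies on the new default ("unified"); the other forces
        // the "plain" highlighter explicitly via highlighterType().
        static HighlightBuilder perFieldTypes() {
            return new HighlightBuilder()
                .field(new HighlightBuilder.Field("title"))
                .field(new HighlightBuilder.Field("body").highlighterType("plain"));
        }
    }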
@@ -50,7 +50,6 @@ import java.util.Locale;
 import java.util.Map;
 
 public class FastVectorHighlighter implements Highlighter {
 
     private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
     private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
         new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
@@ -39,8 +39,6 @@ import java.util.List;
 import java.util.Map;
 
 public class HighlightPhase extends AbstractComponent implements FetchSubPhase {
-    private static final List<String> STANDARD_HIGHLIGHTERS_BY_PRECEDENCE = Arrays.asList("fvh", "postings", "plain");
-
     private final Map<String, Highlighter> highlighters;
 
     public HighlightPhase(Settings settings, Map<String, Highlighter> highlighters) {
@@ -94,13 +92,7 @@ public class HighlightPhase extends AbstractComponent implements FetchSubPhase {
             }
             String highlighterType = field.fieldOptions().highlighterType();
             if (highlighterType == null) {
-                for(String highlighterCandidate : STANDARD_HIGHLIGHTERS_BY_PRECEDENCE) {
-                    if (highlighters.get(highlighterCandidate).canHighlight(fieldMapper)) {
-                        highlighterType = highlighterCandidate;
-                        break;
-                    }
-                }
-                assert highlighterType != null;
+                highlighterType = "unified";
             }
             Highlighter highlighter = highlighters.get(highlighterType);
             if (highlighter == null) {
@@ -35,7 +35,7 @@ import static java.util.Collections.singleton;
 
 public final class HighlightUtils {
 
-    //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting (postings highlighter)
+    //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting (unified highlighter)
     public static final char PARAGRAPH_SEPARATOR = 8233;
     public static final char NULL_SEPARATOR = '\u0000';
 
@@ -49,7 +49,6 @@ import java.util.List;
 import java.util.Map;
 
 public class PlainHighlighter implements Highlighter {
 
     private static final String CACHE_KEY = "highlight-plain";
 
     @Override
@@ -1,195 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.elasticsearch.search.fetch.subphase.highlight;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.highlight.Encoder;
-import org.apache.lucene.search.postingshighlight.CustomPassageFormatter;
-import org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter;
-import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
-import org.apache.lucene.search.highlight.Snippet;
-import org.apache.lucene.util.CollectionUtil;
-import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.text.Text;
-import org.elasticsearch.index.mapper.FieldMapper;
-import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
-import org.elasticsearch.search.fetch.FetchSubPhase;
-import org.elasticsearch.search.internal.SearchContext;
-import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils.Encoders;
-
-import java.io.IOException;
-import java.text.BreakIterator;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-public class PostingsHighlighter implements Highlighter {
-
-    private static final String CACHE_KEY = "highlight-postings";
-
-    @Override
-    public HighlightField highlight(HighlighterContext highlighterContext) {
-
-        FieldMapper fieldMapper = highlighterContext.mapper;
-        SearchContextHighlight.Field field = highlighterContext.field;
-        if (canHighlight(fieldMapper) == false) {
-            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
-                + "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
-        }
-
-        SearchContext context = highlighterContext.context;
-        FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
-
-        if (!hitContext.cache().containsKey(CACHE_KEY)) {
-            hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
-        }
-
-        HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
-        MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
-
-        if (mapperHighlighterEntry == null) {
-            Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
-            CustomPassageFormatter passageFormatter = new CustomPassageFormatter(
-                field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
-            mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
-        }
-
-        List<Snippet> snippets = new ArrayList<>();
-        int numberOfFragments;
-        try {
-            Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
-            List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
-            CustomPostingsHighlighter highlighter;
-            if (field.fieldOptions().numberOfFragments() == 0) {
-                //we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
-                //so we don't lose the distinction between the different values of a field and we get back a snippet per value
-                String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
-                CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
-                highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator,
-                    fieldValue, field.fieldOptions().noMatchSize() > 0);
-                numberOfFragments = fieldValues.size(); //we are highlighting the whole content, one snippet per value
-            } else {
-                //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
-                String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
-                highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter,
-                    fieldValue, field.fieldOptions().noMatchSize() > 0);
-                numberOfFragments = field.fieldOptions().numberOfFragments();
-            }
-
-            IndexSearcher searcher = new IndexSearcher(hitContext.reader());
-            Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher,
-                hitContext.docId(), numberOfFragments);
-            for (Snippet fieldSnippet : fieldSnippets) {
-                if (Strings.hasText(fieldSnippet.getText())) {
-                    snippets.add(fieldSnippet);
-                }
-            }
-
-        } catch(IOException e) {
-            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
-        }
-
-        snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
-
-        if (field.fieldOptions().scoreOrdered()) {
-            //let's sort the snippets by score if needed
-            CollectionUtil.introSort(snippets, new Comparator<Snippet>() {
-                @Override
-                public int compare(Snippet o1, Snippet o2) {
-                    return (int) Math.signum(o2.getScore() - o1.getScore());
-                }
-            });
-        }
-
-        String[] fragments = new String[snippets.size()];
-        for (int i = 0; i < fragments.length; i++) {
-            fragments[i] = snippets.get(i).getText();
-        }
-
-        if (fragments.length > 0) {
-            return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
-        }
-
-        return null;
-    }
-
-    @Override
-    public boolean canHighlight(FieldMapper fieldMapper) {
-        return fieldMapper.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
-    }
-
-    static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
-        //postings highlighter accepts all values in a single string, as offsets etc. need to match with content
-        //loaded from stored fields, we merge all values using a proper separator
-        String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
-        return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
-    }
-
-    static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
-
-        //We need to filter the snippets as due to no_match_size we could have
-        //either highlighted snippets or non highlighted ones and we don't want to mix those up
-        List<Snippet> filteredSnippets = new ArrayList<>(snippets.size());
-        for (Snippet snippet : snippets) {
-            if (snippet.isHighlighted()) {
-                filteredSnippets.add(snippet);
-            }
-        }
-
-        //if there's at least one highlighted snippet, we return all the highlighted ones
-        //otherwise we return the first non highlighted one if available
-        if (filteredSnippets.size() == 0) {
-            if (snippets.size() > 0) {
-                Snippet snippet = snippets.get(0);
-                //if we tried highlighting the whole content using whole break iterator (as number_of_fragments was 0)
-                //we need to return the first sentence of the content rather than the whole content
-                if (numberOfFragments == 0) {
-                    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
-                    String text = snippet.getText();
-                    bi.setText(text);
-                    int next = bi.next();
-                    if (next != BreakIterator.DONE) {
-                        String newText = text.substring(0, next).trim();
-                        snippet = new Snippet(newText, snippet.getScore(), snippet.isHighlighted());
-                    }
-                }
-                filteredSnippets.add(snippet);
-            }
-        }
-
-        return filteredSnippets;
-    }
-
-    static class HighlighterEntry {
-        Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
-    }
-
-    static class MapperHighlighterEntry {
-        final CustomPassageFormatter passageFormatter;
-
-        private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
-            this.passageFormatter = passageFormatter;
-        }
-    }
-}
@@ -21,7 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.highlight.Encoder;
-import org.apache.lucene.search.highlight.Snippet;
+import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
 import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
@@ -44,8 +44,6 @@ import java.util.Map;
 import java.util.stream.Collectors;
 
 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
-import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
-import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
 
 public class UnifiedHighlighter implements Highlighter {
     private static final String CACHE_KEY = "highlight-unified";
@@ -174,6 +172,49 @@ public class UnifiedHighlighter implements Highlighter {
         }
     }
 
+    private static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
+
+        //We need to filter the snippets as due to no_match_size we could have
+        //either highlighted snippets or non highlighted ones and we don't want to mix those up
+        List<Snippet> filteredSnippets = new ArrayList<>(snippets.size());
+        for (Snippet snippet : snippets) {
+            if (snippet.isHighlighted()) {
+                filteredSnippets.add(snippet);
+            }
+        }
+
+        //if there's at least one highlighted snippet, we return all the highlighted ones
+        //otherwise we return the first non highlighted one if available
+        if (filteredSnippets.size() == 0) {
+            if (snippets.size() > 0) {
+                Snippet snippet = snippets.get(0);
+                //if we tried highlighting the whole content using whole break iterator (as number_of_fragments was 0)
+                //we need to return the first sentence of the content rather than the whole content
+                if (numberOfFragments == 0) {
+                    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
+                    String text = snippet.getText();
+                    bi.setText(text);
+                    int next = bi.next();
+                    if (next != BreakIterator.DONE) {
+                        String newText = text.substring(0, next).trim();
+                        snippet = new Snippet(newText, snippet.getScore(), snippet.isHighlighted());
+                    }
+                }
+                filteredSnippets.add(snippet);
+            }
+        }
+
+        return filteredSnippets;
+    }
+
+    private static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
+        //postings highlighter accepts all values in a single string, as offsets etc. need to match with content
+        //loaded from stored fields, we merge all values using a proper separator
+        String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
+        return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
+    }
+
+
     private static class HighlighterEntry {
         Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
     }
@@ -1,105 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.search.postingshighlight;
-
-import org.apache.lucene.search.highlight.Snippet;
-import org.apache.lucene.search.highlight.DefaultEncoder;
-import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
-import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.test.ESTestCase;
-
-import static org.hamcrest.CoreMatchers.equalTo;
-import static org.hamcrest.CoreMatchers.notNullValue;
-
-
-public class CustomPassageFormatterTests extends ESTestCase {
-    public void testSimpleFormat() {
-        String content = "This is a really cool highlighter. Postings highlighter gives nice snippets back. No matches here.";
-
-        CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
-
-        Passage[] passages = new Passage[3];
-        String match = "highlighter";
-        BytesRef matchBytesRef = new BytesRef(match);
-
-        Passage passage1 = new Passage();
-        int start = content.indexOf(match);
-        int end = start + match.length();
-        passage1.startOffset = 0;
-        passage1.endOffset = end + 2; //lets include the whitespace at the end to make sure we trim it
-        passage1.addMatch(start, end, matchBytesRef);
-        passages[0] = passage1;
-
-        Passage passage2 = new Passage();
-        start = content.lastIndexOf(match);
-        end = start + match.length();
-        passage2.startOffset = passage1.endOffset;
-        passage2.endOffset = end + 26;
-        passage2.addMatch(start, end, matchBytesRef);
-        passages[1] = passage2;
-
-        Passage passage3 = new Passage();
-        passage3.startOffset = passage2.endOffset;
-        passage3.endOffset = content.length();
-        passages[2] = passage3;
-
-        Snippet[] fragments = passageFormatter.format(passages, content);
-        assertThat(fragments, notNullValue());
-        assertThat(fragments.length, equalTo(3));
-        assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
-        assertThat(fragments[0].isHighlighted(), equalTo(true));
-        assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
-        assertThat(fragments[1].isHighlighted(), equalTo(true));
-        assertThat(fragments[2].getText(), equalTo("No matches here."));
-        assertThat(fragments[2].isHighlighted(), equalTo(false));
-    }
-
-    public void testHtmlEncodeFormat() {
-        String content = "<b>This is a really cool highlighter.</b> Postings highlighter gives nice snippets back.";
-
-        CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
-
-        Passage[] passages = new Passage[2];
-        String match = "highlighter";
-        BytesRef matchBytesRef = new BytesRef(match);
-
-        Passage passage1 = new Passage();
-        int start = content.indexOf(match);
-        int end = start + match.length();
-        passage1.startOffset = 0;
-        passage1.endOffset = end + 6; //lets include the whitespace at the end to make sure we trim it
-        passage1.addMatch(start, end, matchBytesRef);
-        passages[0] = passage1;
-
-        Passage passage2 = new Passage();
-        start = content.lastIndexOf(match);
-        end = start + match.length();
-        passage2.startOffset = passage1.endOffset;
-        passage2.endOffset = content.length();
-        passage2.addMatch(start, end, matchBytesRef);
-        passages[1] = passage2;
-
-        Snippet[] fragments = passageFormatter.format(passages, content);
-        assertThat(fragments, notNullValue());
-        assertThat(fragments.length, equalTo(2));
-        assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
-        assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
-    }
-}
@@ -1,157 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.search.postingshighlight;
-
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.highlight.Snippet;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.highlight.DefaultEncoder;
-import org.apache.lucene.store.Directory;
-import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
-import org.elasticsearch.test.ESTestCase;
-
-import static org.hamcrest.CoreMatchers.equalTo;
-
-public class CustomPostingsHighlighterTests extends ESTestCase {
-    public void testCustomPostingsHighlighter() throws Exception {
-        Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
-        iwc.setMergePolicy(newLogMergePolicy());
-        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
-
-        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
-        offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-
-        //good position but only one match
-        final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
-        Field body = new Field("body", "", offsetsType);
-        Document doc = new Document();
-        doc.add(body);
-        body.setStringValue(firstValue);
-
-        //two matches, not the best snippet due to its length though
-        final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
-        Field body2 = new Field("body", "", offsetsType);
-        doc.add(body2);
-        body2.setStringValue(secondValue);
-
-        //two matches and short, will be scored highest
-        final String thirdValue = "This is highlighting the third short highlighting value.";
-        Field body3 = new Field("body", "", offsetsType);
-        doc.add(body3);
-        body3.setStringValue(thirdValue);
-
-        //one match, same as first but at the end, will be scored lower due to its position
-        final String fourthValue = "Just a test4 highlighting from postings highlighter.";
-        Field body4 = new Field("body", "", offsetsType);
-        doc.add(body4);
-        body4.setStringValue(fourthValue);
-
-        iw.addDocument(doc);
-
-        IndexReader ir = iw.getReader();
-        iw.close();
-
-        String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
-        String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
-        String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
-        String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
-
-        IndexSearcher searcher = newSearcher(ir);
-        Query query = new TermQuery(new Term("body", "highlighting"));
-
-        TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
-        assertThat(topDocs.totalHits, equalTo(1));
-
-        int docId = topDocs.scoreDocs[0].doc;
-
-        String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
-
-        CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
-        Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
-
-        assertThat(snippets.length, equalTo(4));
-
-        assertThat(snippets[0].getText(), equalTo(firstHlValue));
-        assertThat(snippets[1].getText(), equalTo(secondHlValue));
-        assertThat(snippets[2].getText(), equalTo(thirdHlValue));
-        assertThat(snippets[3].getText(), equalTo(fourthHlValue));
-
-        ir.close();
-        dir.close();
-    }
-
-    public void testNoMatchSize() throws Exception {
-        Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
-        iwc.setMergePolicy(newLogMergePolicy());
-        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
-
-        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
-        offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        Field body = new Field("body", "", offsetsType);
-        Field none = new Field("none", "", offsetsType);
-        Document doc = new Document();
-        doc.add(body);
-        doc.add(none);
-
-        String firstValue = "This is a test. Just a test highlighting from postings. Feel free to ignore.";
-        body.setStringValue(firstValue);
-        none.setStringValue(firstValue);
-        iw.addDocument(doc);
-
-        IndexReader ir = iw.getReader();
-        iw.close();
-
-        Query query = new TermQuery(new Term("none", "highlighting"));
-
-        IndexSearcher searcher = newSearcher(ir);
-        TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
-        assertThat(topDocs.totalHits, equalTo(1));
-        int docId = topDocs.scoreDocs[0].doc;
-
-        CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
-
-        CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, passageFormatter, firstValue, false);
-        Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
-        assertThat(snippets.length, equalTo(0));
-
-        highlighter = new CustomPostingsHighlighter(null, passageFormatter, firstValue, true);
-        snippets = highlighter.highlightField("body", query, searcher, docId, 5);
-        assertThat(snippets.length, equalTo(1));
-        assertThat(snippets[0].getText(), equalTo("This is a test."));
-
-        ir.close();
-        dir.close();
-    }
-}
@@ -1,178 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.search.postingshighlight;
-
-import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
-import org.elasticsearch.test.ESTestCase;
-
-import java.text.BreakIterator;
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.Locale;
-
-import static org.hamcrest.CoreMatchers.equalTo;
-
-public class CustomSeparatorBreakIteratorTests extends ESTestCase {
-    public void testBreakOnCustomSeparator() throws Exception {
-        Character separator = randomSeparator();
-        BreakIterator bi = new CustomSeparatorBreakIterator(separator);
-        String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
-        bi.setText(source);
-        assertThat(bi.current(), equalTo(0));
-        assertThat(bi.first(), equalTo(0));
-        assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
-        assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
-        assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
-        assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
-        assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
-        assertThat(bi.next(), equalTo(BreakIterator.DONE));
-
-        assertThat(bi.last(), equalTo(source.length()));
-        int current = bi.current();
-        assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
-        current = bi.current();
-        assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
-        current = bi.current();
-        assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
-        current = bi.current();
-        assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
-        current = bi.current();
-        assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
-        assertThat(bi.previous(), equalTo(BreakIterator.DONE));
-        assertThat(bi.current(), equalTo(0));
-
-        assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
-
-        assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
-
-        assertThat(bi.first(), equalTo(0));
-        assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
-    }
-
-    public void testSingleSentences() throws Exception {
-        BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
-        BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
-        assertSameBreaks("a", expected, actual);
-        assertSameBreaks("ab", expected, actual);
-        assertSameBreaks("abc", expected, actual);
-        assertSameBreaks("", expected, actual);
-    }
-
-    public void testSliceEnd() throws Exception {
-        BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
-        BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
-        assertSameBreaks("a000", 0, 1, expected, actual);
-        assertSameBreaks("ab000", 0, 1, expected, actual);
-        assertSameBreaks("abc000", 0, 1, expected, actual);
-        assertSameBreaks("000", 0, 0, expected, actual);
-    }
-
-    public void testSliceStart() throws Exception {
-        BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
-        BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
-        assertSameBreaks("000a", 3, 1, expected, actual);
-        assertSameBreaks("000ab", 3, 2, expected, actual);
-        assertSameBreaks("000abc", 3, 3, expected, actual);
-        assertSameBreaks("000", 3, 0, expected, actual);
-    }
-
-    public void testSliceMiddle() throws Exception {
-        BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
-        BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
-        assertSameBreaks("000a000", 3, 1, expected, actual);
-        assertSameBreaks("000ab000", 3, 2, expected, actual);
-        assertSameBreaks("000abc000", 3, 3, expected, actual);
-        assertSameBreaks("000000", 3, 0, expected, actual);
-    }
-
-    /** the current position must be ignored, initial position is always first() */
-    public void testFirstPosition() throws Exception {
-        BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
-        BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
-        assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
-    }
-
-    private static char randomSeparator() {
-        return randomFrom(' ', HighlightUtils.NULL_SEPARATOR, HighlightUtils.PARAGRAPH_SEPARATOR);
-    }
-
-    private static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
-        assertSameBreaks(new StringCharacterIterator(text),
-                new StringCharacterIterator(text),
-                expected,
-                actual);
-    }
-
-    private static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
-        assertSameBreaks(text, offset, length, offset, expected, actual);
-    }
-
-    private static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
-        assertSameBreaks(new StringCharacterIterator(text, offset, offset + length, current),
-                new StringCharacterIterator(text, offset, offset + length, current),
-                expected,
-                actual);
-    }
-
-    /** Asserts that two breakiterators break the text the same way */
-    private static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
-        expected.setText(one);
-        actual.setText(two);
-
-        assertEquals(expected.current(), actual.current());
-
-        // next()
-        int v = expected.current();
-        while (v != BreakIterator.DONE) {
-            assertEquals(v = expected.next(), actual.next());
-            assertEquals(expected.current(), actual.current());
-        }
-
-        // first()
-        assertEquals(expected.first(), actual.first());
-        assertEquals(expected.current(), actual.current());
-        // last()
-        assertEquals(expected.last(), actual.last());
-        assertEquals(expected.current(), actual.current());
-
-        // previous()
-        v = expected.current();
-        while (v != BreakIterator.DONE) {
-            assertEquals(v = expected.previous(), actual.previous());
-            assertEquals(expected.current(), actual.current());
-        }
-
-        // following()
-        for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
-            expected.first();
-            actual.first();
-            assertEquals(expected.following(i), actual.following(i));
-            assertEquals(expected.current(), actual.current());
-        }
-
-        // preceding()
-        for (int i = one.getBeginIndex(); i <= one.getEndIndex(); i++) {
-            expected.last();
-            actual.last();
-            assertEquals(expected.preceding(i), actual.preceding(i));
-            assertEquals(expected.current(), actual.current());
-        }
-    }
-}
@@ -19,7 +19,6 @@
 
 package org.apache.lucene.search.uhighlight;
 
-import org.apache.lucene.search.highlight.Snippet;
 import org.apache.lucene.search.highlight.DefaultEncoder;
 import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
 import org.apache.lucene.util.BytesRef;
@@ -41,7 +41,6 @@ import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.highlight.DefaultEncoder;
-import org.apache.lucene.search.highlight.Snippet;
 import org.apache.lucene.store.Directory;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
@@ -56,7 +56,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.CustomHighlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
-import org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter;
+import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
 import org.elasticsearch.search.internal.SearchContext;
 import org.elasticsearch.search.suggest.CustomSuggesterSearchIT.CustomSuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
@@ -204,7 +204,7 @@ public class SearchModuleTests extends ModuleTestCase {
         Map<String, Highlighter> highlighters = module.getHighlighters();
         assertEquals(FastVectorHighlighter.class, highlighters.get("fvh").getClass());
         assertEquals(PlainHighlighter.class, highlighters.get("plain").getClass());
-        assertEquals(PostingsHighlighter.class, highlighters.get("postings").getClass());
+        assertEquals(UnifiedHighlighter.class, highlighters.get("unified").getClass());
         assertSame(highlighters.get("custom"), customHighlighter);
     }
 
@@ -852,7 +852,7 @@ public class TopHitsIT extends ESIntegTestCase {
     }
 
     public void testNestedFetchFeatures() {
-        String hlType = randomFrom("plain", "fvh", "postings");
+        String hlType = randomFrom("plain", "fvh", "unified");
         HighlightBuilder.Field hlField = new HighlightBuilder.Field("comments.message")
             .highlightQuery(matchQuery("comments.message", "comment"))
             .forceSource(randomBoolean()) // randomly from stored field or _source
File diff suppressed because it is too large.
@@ -312,8 +312,7 @@ disk space and, because it is a combination of other fields, it may result in
 odd highlighting results.
 
 The `_all` field also accepts the `term_vector` and `index_options`
-parameters, allowing the use of the fast vector highlighter and the postings
-highlighter.
+parameters, allowing highlighting to use it.
 
 [[all-highlight-fields]]
 ===== Highlight original fields
@@ -26,7 +26,7 @@ following settings:
 
 Doc number, term frequencies, positions, and start and end character
 offsets (which map the term back to the original string) are indexed.
-Offsets are used by the <<postings-highlighter,postings highlighter>>.
+Offsets are used by the <<unified-highlighter,unified highlighter>> to speed up highlighting.
 
 <<mapping-index,Analyzed>> string fields use `positions` as the default, and
 all other fields use `docs` as the default.
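For reference, this is roughly what `offsets` in the postings corresponds to at the Lucene level (a sketch mirroring the `FieldType` setup used in the tests deleted above):

    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexOptions;

    class OffsetsFieldType {
        // A stored text field whose postings carry character offsets; with
        // this index option the unified highlighter can highlight without
        // re-analyzing the field value.
        static FieldType storedTextWithOffsets() {
            FieldType fieldType = new FieldType(TextField.TYPE_STORED);
            fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
            fieldType.freeze();
            return fieldType;
        }
    }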
@@ -67,4 +67,4 @@ GET my_index/_search
 }
 --------------------------------------------------
 // CONSOLE
-<1> The `text` field will use the postings highlighter by default because `offsets` are indexed.
+<1> The `text` field will use the postings for highlighting by default because `offsets` are indexed.
@ -188,7 +188,7 @@ accessed within the scope of the `nested` query, the
For instance, if a string field within a nested document has
<<index-options,`index_options`>> set to `offsets` to allow use of the postings
highlighter, these offsets will not be available during the main highlighting
during highlighting, these offsets will not be available during the main highlighting
phase. Instead, highlighting needs to be performed via
<<nested-inner-hits,nested inner hits>>.
@ -98,3 +98,14 @@ but the only reason why it has not been deprecated too is because it is used
for the `random_score` function. If you really need access to the id of
documents for sorting, aggregations or search scripts, the recommendation is
to duplicate the id as a field in the document.

==== Highlighters

The `unified` highlighter is the new default choice for highlighting.
The offset strategy for each field is picked internally by this highlighter depending on the
type of the field (`index_options`).
It is still possible to force the highlighter type to `fvh` or `plain`.

The `postings` highlighter has been removed from Lucene and Elasticsearch.
The `unified` highlighter outputs the same highlighting when `index_options` is set
to `offsets`.
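
As an illustration (a sketch only; the `comment` field name is an assumption),
forcing one of the remaining non-default highlighters now looks like this:

[source,js]
--------------------------------------------------
GET /_search
{
  "query" : {
    "match" : { "comment" : "highlighting" }
  },
  "highlight" : {
    "fields" : {
      "comment" : { "type" : "plain" }
    }
  }
}
--------------------------------------------------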
@ -1,9 +1,8 @@
[[search-request-highlighting]]
=== Highlighting

Allows to highlight search results on one or more fields. The
implementation uses either the lucene `plain` highlighter, the
fast vector highlighter (`fvh`) or `postings` highlighter.
Highlighters allow you to produce highlighted snippets from one or more fields
in your search results.
The following is an example of the search request body:

[source,js]
@ -45,35 +44,48 @@ from versions before 5.0) that match the expression to be highlighted.
Note that all other fields will not be highlighted. If you use a custom mapper and want to
highlight on a field anyway, you have to provide the field name explicitly.

[[plain-highlighter]]
==== Plain highlighter
[[unified-highlighter]]
==== Unified Highlighter

The default choice of highlighter is of type `plain` and uses the Lucene highlighter.
It tries hard to reflect the query matching logic in terms of understanding word importance and any word positioning criteria in phrase queries.
The unified highlighter (which is used by default if no highlighter type is specified)
uses the Lucene Unified Highlighter.
This highlighter breaks the text into sentences and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm.
It also supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting.
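
For instance, here is a sketch of highlighting a multi-term `prefix` query with the
default `unified` highlighter (the `comment` field name is an assumption):

[source,js]
--------------------------------------------------
GET /_search
{
  "query" : {
    "prefix" : { "comment" : "high" }
  },
  "highlight" : {
    "fields" : {
      "comment" : {}
    }
  }
}
--------------------------------------------------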

[WARNING]
If you want to highlight a lot of fields in a lot of documents with complex queries this highlighter will not be fast.
In its efforts to accurately reflect query logic it creates a tiny in-memory index and re-runs the original query criteria through
[float]
===== Offsets Strategy

In order to create meaningful search snippets from the terms being queried,
a highlighter needs to know the start and end character offsets of each word
in the original text.
These offsets can be obtained from:

* The postings list (fields mapped with `"index_options": "offsets"`).
* Term vectors (fields mapped with `"term_vector": "with_positions_offsets"`).
* The original field, by re-analyzing the text on the fly.

[float]
====== Plain highlighting

This mode is picked when there is no other alternative.
It creates a tiny in-memory index and re-runs the original query criteria through
Lucene's query execution planner to get access to low-level match information on the current document.
This is repeated for every field and every document that needs highlighting. If this presents a performance issue in your system consider using an alternative highlighter.
This is repeated for every field and every document that needs highlighting.
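
A field mapped with neither offsets nor term vectors falls back to this mode; as a
sketch (index, type and field names mirror the examples below):

[source,js]
--------------------------------------------------
PUT /example
{
  "mappings": {
    "doc" : {
      "properties": {
        "comment" : {
          "type": "text"
        }
      }
    }
  }
}
--------------------------------------------------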

[[postings-highlighter]]
==== Postings highlighter
[float]
====== Postings

If `index_options` is set to `offsets` in the mapping the postings highlighter
will be used instead of the plain highlighter. The postings highlighter:

* Is faster since it doesn't require to reanalyze the text to be highlighted:
the larger the documents the better the performance gain should be
* Requires less disk space than term_vectors, needed for the fast vector
highlighter
* Breaks the text into sentences and highlights them. Plays really well with
natural languages, not as well with fields containing for instance html markup
* Treats the document as the whole corpus, and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm
If `index_options` is set to `offsets` in the mapping the `unified` highlighter
will use this information to highlight documents without re-analyzing the text.
It re-runs the original query directly on the postings and extracts the matching offsets
directly from the index, limiting the collection to the highlighted documents.
This mode is faster on large fields since it doesn't require re-analyzing the text to be highlighted,
and it requires less disk space than term vectors, which the fast vector
highlighter needs.

Here is an example of setting the `comment` field in the index mapping to allow for
highlighting using the postings highlighter on it:
highlighting using the postings:

[source,js]
--------------------------------------------------
@ -93,24 +105,56 @@ PUT /example
--------------------------------------------------
// CONSOLE

[NOTE]
Note that the postings highlighter is meant to perform simple query terms
highlighting, regardless of their positions. That means that when used for
instance in combination with a phrase query, it will highlight all the terms
that the query is composed of, regardless of whether they are actually part of
a query match, effectively ignoring their positions.
[float]
====== Term Vectors

If `term_vector` information is provided by setting `term_vector` to
`with_positions_offsets` in the mapping then the `unified` highlighter
will automatically use the `term_vector` to highlight the field.
Highlighting with `term_vector` data is faster for multi-term queries like
`prefix` or `wildcard` because it can access the dictionary of terms for each document,
but it is also usually more costly than using the `postings` directly.

Here is an example of setting the `comment` field to allow for
highlighting using the `term_vectors` (this will cause the index to be bigger):

[source,js]
--------------------------------------------------
PUT /example
{
  "mappings": {
    "doc" : {
      "properties": {
        "comment" : {
          "type": "text",
          "term_vector" : "with_positions_offsets"
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

[[plain-highlighter]]
==== Plain highlighter

This highlighter of type `plain` uses the standard Lucene highlighter.
It tries hard to reflect the query matching logic in terms of understanding word importance and any word positioning criteria in phrase queries.

[WARNING]
The postings highlighter doesn't support highlighting some complex queries,
like a `match` query with `type` set to `match_phrase_prefix`. No highlighted
snippets will be returned in that case.
If you want to highlight a lot of fields in a lot of documents with complex queries this highlighter will not be fast.
In its efforts to accurately reflect query logic it creates a tiny in-memory index and re-runs the original query criteria through
Lucene's query execution planner to get access to low-level match information on the current document.
This is repeated for every field and every document that needs highlighting. If this presents a performance issue in your system consider using an alternative highlighter.

[[fast-vector-highlighter]]
==== Fast vector highlighter

If `term_vector` information is provided by setting `term_vector` to
`with_positions_offsets` in the mapping then the fast vector highlighter
will be used instead of the plain highlighter. The fast vector highlighter:
This highlighter of type `fvh` uses the Lucene Fast Vector highlighter.
This highlighter can be used on fields with `term_vector` set to
`with_positions_offsets` in the mapping.
The fast vector highlighter:

* Is faster especially for large fields (> `1MB`)
* Can be customized with `boundary_scanner` (see <<boundary-scanners,below>>)
@ -144,30 +188,10 @@ PUT /example
--------------------------------------------------
// CONSOLE

==== Unified Highlighter

experimental[]

The `unified` highlighter can extract offsets from either postings, term vectors, or via re-analyzing text.
Under the hood it uses Lucene UnifiedHighlighter which picks its strategy depending on the field and the query to highlight.
Independently of the strategy this highlighter breaks the text into sentences and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm.
It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting and can be used with the following options:

* `force_source`
* `encoder`
* `highlight_query`
* `pre_tags` and `post_tags`
* `require_field_match`
* `boundary_scanner` (`sentence` (**default**) or `word`)
* `max_fragment_length` (only for `sentence` scanner)
* `no_match_size`

==== Force highlighter type

The `type` field allows to force a specific highlighter type. This is useful
for instance when needing to use the plain highlighter on a field that has
`term_vectors` enabled. The allowed values are: `plain`, `postings` and `fvh`.
The `type` field allows to force a specific highlighter type.
The allowed values are: `unified`, `plain` and `fvh`.
The following is an example that forces the use of the plain highlighter:

[source,js]
@ -320,9 +344,6 @@ GET /_search
// CONSOLE
// TEST[setup:twitter]

The `fragment_size` is ignored when using the postings highlighter, as it
outputs sentences regardless of their length.

On top of this it is possible to specify that highlighted fragments need
to be sorted by score:
@ -375,10 +396,7 @@ In the case where there is no matching fragment to highlight, the default is
to not return anything. Instead, we can return a snippet of text from the
beginning of the field by setting `no_match_size` (default `0`) to the length
of the text that you want returned. The actual length may be shorter or longer than
specified as it tries to break on a word boundary. When using the postings
highlighter it is not possible to control the actual size of the snippet,
therefore the first sentence gets returned whenever `no_match_size` is
greater than `0`.
specified as it tries to break on a word boundary.

[source,js]
--------------------------------------------------
@ -403,6 +421,8 @@ GET /_search
==== Fragmenter

WARNING: This option is not supported by the `unified` highlighter

Fragmenter can control how text should be broken up in highlight snippets.
However, this option is applicable only for the Plain Highlighter.
There are two options:
@ -421,6 +441,7 @@ GET twitter/tweet/_search
"highlight" : {
    "fields" : {
        "message" : {
            "type": "plain",
            "fragment_size" : 15,
            "number_of_fragments" : 3,
            "fragmenter": "simple"
@ -476,6 +497,7 @@ GET twitter/tweet/_search
"highlight" : {
    "fields" : {
        "message" : {
            "type": "plain",
            "fragment_size" : 15,
            "number_of_fragments" : 3,
            "fragmenter": "span"
@ -596,12 +618,6 @@ GET /_search
// CONSOLE
// TEST[setup:twitter]

Note that the score of text fragment in this case is calculated by the Lucene
highlighting framework. For implementation details you can check the
`ScoreOrderFragmentsBuilder.java` class. On the other hand when using the
postings highlighter the fragments are scored using, as mentioned above,
the BM25 algorithm.

[[highlighting-settings]]
==== Global Settings
|
[[matched-fields]]
==== Matched Fields

WARNING: This is only supported by the `fvh` highlighter

The Fast Vector Highlighter can combine matches on multiple fields to
highlight a single field using `matched_fields`. This is most
intuitive for multifields that analyze the same string in different
@ -814,6 +833,9 @@ to
[[phrase-limit]]
==== Phrase Limit

WARNING: this is only supported by the `fvh` highlighter

The fast vector highlighter has a `phrase_limit` parameter that prevents
it from analyzing too many phrases and eating tons of memory. It defaults
to 256 so only the first 256 matching phrases in the document scored
@ -136,7 +136,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
refresh();
SearchResponse search = client().prepareSearch()
        .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
        .highlighter(new HighlightBuilder().field("body")).get();
        .highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).get();
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
search = client()
        .prepareSearch()
@ -146,7 +146,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
+ "is a test for highlighting feature"))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
.highlighter(new HighlightBuilder().field("body").highlighterType("fvh")).execute().actionGet();
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));