Integrate UnifiedHighlighter (#21621)

* Integrate UnifiedHighlighter

This change integrates the Lucene highlighter called "unified" in the list of supported highlighters for ES.
This highlighter can extract offsets from either postings, term vectors, or via re-analyzing text.
The best strategy is picked automatically at query time and depends on the field and the query to highlight.
This commit is contained in:
Jim Ferenczi 2017-01-31 19:06:03 +01:00 committed by GitHub
parent f90051e6e0
commit f6d38d480a
18 changed files with 1445 additions and 432 deletions

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.apache.lucene.search.postingshighlight;
package org.apache.lucene.search.highlight;
/**
* Represents a scored highlighted snippet.

View File

@ -19,6 +19,7 @@
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.Encoder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
@ -46,10 +47,10 @@ public class CustomPassageFormatter extends PassageFormatter {
for (int j = 0; j < passages.length; j++) {
Passage passage = passages[j];
StringBuilder sb = new StringBuilder();
pos = passage.startOffset;
for (int i = 0; i < passage.numMatches; i++) {
int start = passage.matchStarts[i];
int end = passage.matchEnds[i];
pos = passage.getStartOffset();
for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.getMatchStarts()[i];
int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms
if (start > pos) {
append(sb, content, pos, start);
@ -62,7 +63,7 @@ public class CustomPassageFormatter extends PassageFormatter {
}
}
// its possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.endOffset));
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
//we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
sb.deleteCharAt(sb.length() - 1);
@ -70,7 +71,7 @@ public class CustomPassageFormatter extends PassageFormatter {
sb.deleteCharAt(sb.length() - 1);
}
//and we trim the snippets too
snippets[j] = new Snippet(sb.toString().trim(), passage.score, passage.numMatches > 0);
snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
}
return snippets;
}

View File

@ -22,6 +22,7 @@ package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Snippet;
import java.io.IOException;
import java.text.BreakIterator;

View File

@ -0,0 +1,82 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Snippet;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
/**
* Custom passage formatter that allows us to:
* 1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet})
* 2) use the {@link Encoder} implementations that are already used with the other highlighters
*/
public class CustomPassageFormatter extends PassageFormatter {
private final String preTag;
private final String postTag;
private final Encoder encoder;
public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
this.preTag = preTag;
this.postTag = postTag;
this.encoder = encoder;
}
@Override
public Snippet[] format(Passage[] passages, String content) {
Snippet[] snippets = new Snippet[passages.length];
int pos;
for (int j = 0; j < passages.length; j++) {
Passage passage = passages[j];
StringBuilder sb = new StringBuilder();
pos = passage.getStartOffset();
for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.getMatchStarts()[i];
int end = passage.getMatchEnds()[i];
// its possible to have overlapping terms
if (start > pos) {
append(sb, content, pos, start);
}
if (end > pos) {
sb.append(preTag);
append(sb, content, Math.max(pos, start), end);
sb.append(postTag);
pos = end;
}
}
// its possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
//we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
sb.deleteCharAt(sb.length() - 1);
} else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
sb.deleteCharAt(sb.length() - 1);
}
//and we trim the snippets too
snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
}
return snippets;
}
private void append(StringBuilder dest, String content, int start, int end) {
dest.append(encoder.encodeText(content.substring(start, end)));
}
}

View File

@ -0,0 +1,204 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
* Uses a custom {@link PassageFormatter}. Accepts field content as a constructor
* argument, given that loadings field value can be done reading from _source field.
* Supports using different {@link BreakIterator} to break the text into fragments. Considers every distinct field
* value as a discrete passage for highlighting (unless the whole content needs to be highlighted).
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
*/
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link CustomPassageFormatter}
* which generates snippets in forms of {@link Snippet} objects
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
* the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
* returned rather than empty snippets when no highlighting can be performed
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
PassageFormatter passageFormatter,
@Nullable BreakIterator breakIterator,
String fieldValue,
boolean returnNonHighlightedSnippets) {
super(searcher, analyzer);
this.breakIterator = breakIterator;
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
}
/**
* Highlights terms extracted from the provided query within the content of the provided field name
*/
public Snippet[] highlightField(String field, Query query, int docId, int maxPassages) throws IOException {
Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query,
new int[]{docId}, new int[]{maxPassages});
Object[] snippetObjects = fieldsAsObjects.get(field);
if (snippetObjects != null) {
//one single document at a time
assert snippetObjects.length == 1;
Object snippetObject = snippetObjects[0];
if (snippetObject != null && snippetObject instanceof Snippet[]) {
return (Snippet[]) snippetObject;
}
}
return EMPTY_SNIPPET;
}
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
//we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}
@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator != null) {
return breakIterator;
}
return super.getBreakIterator(field);
}
@Override
protected PassageFormatter getFormatter(String field) {
return passageFormatter;
}
@Override
protected int getMaxNoHighlightPassages(String field) {
if (returnNonHighlightedSnippets) {
return 1;
}
return 0;
}
@Override
protected Collection<Query> preMultiTermQueryRewrite(Query query) {
return rewriteCustomQuery(query);
}
@Override
protected Collection<Query> preSpanQueryRewrite(Query query) {
return rewriteCustomQuery(query);
}
/**
* Translate custom queries in queries that are supported by the unified highlighter.
*/
private Collection<Query> rewriteCustomQuery(Query query) {
if (query instanceof MultiPhrasePrefixQuery) {
MultiPhrasePrefixQuery mpq = (MultiPhrasePrefixQuery) query;
Term[][] terms = mpq.getTerms();
int[] positions = mpq.getPositions();
SpanQuery[] positionSpanQueries = new SpanQuery[positions.length];
int sizeMinus1 = terms.length - 1;
for (int i = 0; i < positions.length; i++) {
SpanQuery[] innerQueries = new SpanQuery[terms[i].length];
for (int j = 0; j < terms[i].length; j++) {
if (i == sizeMinus1) {
innerQueries[j] = new SpanMultiTermQueryWrapper(new PrefixQuery(terms[i][j]));
} else {
innerQueries[j] = new SpanTermQuery(terms[i][j]);
}
}
if (innerQueries.length > 1) {
positionSpanQueries[i] = new SpanOrQuery(innerQueries);
} else {
positionSpanQueries[i] = innerQueries[0];
}
}
// sum position increments beyond 1
int positionGaps = 0;
if (positions.length >= 2) {
// positions are in increasing order. max(0,...) is just a safeguard.
positionGaps = Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1);
}
//if original slop is 0 then require inOrder
boolean inorder = (mpq.getSlop() == 0);
return Collections.singletonList(new SpanNearQuery(positionSpanQueries,
mpq.getSlop() + positionGaps, inorder));
} else if (query instanceof CommonTermsQuery) {
CommonTermsQuery ctq = (CommonTermsQuery) query;
List<Query> tqs = new ArrayList<> ();
for (Term term : ctq.getTerms()) {
tqs.add(new TermQuery(term));
}
return tqs;
} else if (query instanceof AllTermQuery) {
AllTermQuery atq = (AllTermQuery) query;
return Collections.singletonList(new TermQuery(atq.getTerm()));
} else if (query instanceof FunctionScoreQuery) {
return Collections.singletonList(((FunctionScoreQuery) query).getSubQuery());
} else if (query instanceof FiltersFunctionScoreQuery) {
return Collections.singletonList(((FiltersFunctionScoreQuery) query).getSubQuery());
} else {
return null;
}
}
}

View File

@ -32,7 +32,6 @@ import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
@ -87,21 +86,18 @@ public final class AllTermQuery extends Query {
if (rewritten != this) {
return rewritten;
}
boolean fieldExists = false;
boolean hasPayloads = false;
for (LeafReaderContext context : reader.leaves()) {
final Terms terms = context.reader().terms(term.field());
if (terms != null) {
fieldExists = true;
if (terms.hasPayloads()) {
hasPayloads = true;
break;
}
}
}
if (fieldExists == false) {
return new MatchNoDocsQuery();
}
// if the terms does not exist we could return a MatchNoDocsQuery but this would break the unified highlighter
// which rewrites query with an empty reader.
if (hasPayloads == false) {
return new TermQuery(term);
}

View File

@ -25,6 +25,8 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.Query;
@ -115,6 +117,20 @@ public class MultiPhrasePrefixQuery extends Query {
positions.add(position);
}
/**
* Returns the terms for each position in this phrase
*/
public Term[][] getTerms() {
Term[][] terms = new Term[termArrays.size()][];
for (int i = 0; i < termArrays.size(); i++) {
terms[i] = new Term[termArrays.get(i).length];
for (int j = 0; j < termArrays.get(i).length; j++) {
terms[i][j] = termArrays.get(i)[j];
}
}
return terms;
}
/**
* Returns the relative positions of terms in this phrase.
*/
@ -150,7 +166,12 @@ public class MultiPhrasePrefixQuery extends Query {
}
}
if (terms.isEmpty()) {
return Queries.newMatchNoDocsQuery("No terms supplied for " + MultiPhrasePrefixQuery.class.getName());
// if the terms does not exist we could return a MatchNoDocsQuery but this would break the unified highlighter
// which rewrites query with an empty reader.
return new BooleanQuery.Builder()
.add(query.build(), BooleanClause.Occur.MUST)
.add(Queries.newMatchNoDocsQuery("No terms supplied for " + MultiPhrasePrefixQuery.class.getName()),
BooleanClause.Occur.MUST).build();
}
query.add(terms.toArray(Term.class), position);
return query.build();

View File

@ -37,7 +37,6 @@ import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

View File

@ -238,6 +238,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
import org.elasticsearch.search.rescore.QueryRescorerBuilder;
import org.elasticsearch.search.rescore.RescoreBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
@ -599,7 +600,7 @@ public class SearchModule {
highlighters.register("fvh", new FastVectorHighlighter(settings));
highlighters.register("plain", new PlainHighlighter());
highlighters.register("postings", new PostingsHighlighter());
highlighters.register("unified", new UnifiedHighlighter());
highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
return unmodifiableMap(highlighters.getRegistry());

View File

@ -25,7 +25,7 @@ import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.postingshighlight.CustomPassageFormatter;
import org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.Snippet;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.text.Text;
@ -139,14 +139,14 @@ public class PostingsHighlighter implements Highlighter {
return fieldMapper.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
}
private static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
//postings highlighter accepts all values in a single string, as offsets etc. need to match with content
//loaded from stored fields, we merge all values using a proper separator
String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
}
private static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
//We need to filter the snippets as due to no_match_size we could have
//either highlighted snippets or non highlighted ones and we don't want to mix those up
@ -181,11 +181,11 @@ public class PostingsHighlighter implements Highlighter {
return filteredSnippets;
}
private static class HighlighterEntry {
static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
}
private static class MapperHighlighterEntry {
static class MapperHighlighterEntry {
final CustomPassageFormatter passageFormatter;
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {

View File

@ -0,0 +1,158 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.filterSnippets;
import static org.elasticsearch.search.fetch.subphase.highlight.PostingsHighlighter.mergeFieldValues;
public class UnifiedHighlighter implements Highlighter {
private static final String CACHE_KEY = "highlight-unified";
@Override
public boolean canHighlight(FieldMapper fieldMapper) {
return true;
}
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
FieldMapper fieldMapper = highlighterContext.mapper;
SearchContextHighlight.Field field = highlighterContext.field;
SearchContext context = highlighterContext.context;
FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
}
HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
if (mapperHighlighterEntry == null) {
Encoder encoder = field.fieldOptions().encoder().equals("html") ?
HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
CustomPassageFormatter passageFormatter =
new CustomPassageFormatter(field.fieldOptions().preTags()[0],
field.fieldOptions().postTags()[0], encoder);
mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
}
List<Snippet> snippets = new ArrayList<>();
int numberOfFragments;
try {
Analyzer analyzer =
context.mapperService().documentMapper(hitContext.hit().type()).mappers().indexAnalyzer();
List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
fieldValues = fieldValues.stream().map(obj -> {
if (obj instanceof BytesRef) {
return fieldMapper.fieldType().valueForDisplay(obj).toString();
} else {
return obj;
}
}).collect(Collectors.toList());
IndexSearcher searcher = new IndexSearcher(hitContext.reader());
CustomUnifiedHighlighter highlighter;
if (field.fieldOptions().numberOfFragments() == 0) {
// we use a control char to separate values, which is the only char that the custom break iterator
// breaks the text on, so we don't lose the distinction between the different values of a field and we
// get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator =
new org.apache.lucene.search.postingshighlight
.CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
highlighter =
new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0);
numberOfFragments = field.fieldOptions().numberOfFragments();
}
if (field.fieldOptions().requireFieldMatch()) {
final String fieldName = highlighterContext.fieldName;
highlighter.setFieldMatcher((name) -> fieldName.equals(name));
} else {
highlighter.setFieldMatcher((name) -> true);
}
Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName,
highlighterContext.query, hitContext.docId(), numberOfFragments);
for (Snippet fieldSnippet : fieldSnippets) {
if (Strings.hasText(fieldSnippet.getText())) {
snippets.add(fieldSnippet);
}
}
} catch (IOException e) {
throw new FetchPhaseExecutionException(context,
"Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
if (field.fieldOptions().scoreOrdered()) {
//let's sort the snippets by score if needed
CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
}
String[] fragments = new String[snippets.size()];
for (int i = 0; i < fragments.length; i++) {
fragments[i] = snippets.get(i).getText();
}
if (fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
return null;
}
static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();
}
static class MapperHighlighterEntry {
final CustomPassageFormatter passageFormatter;
private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) {
this.passageFormatter = passageFormatter;
}
}
}

View File

@ -19,6 +19,7 @@
package org.apache.lucene.search.postingshighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.util.BytesRef;

View File

@ -31,6 +31,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

View File

@ -0,0 +1,105 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.test.ESTestCase;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.notNullValue;
public class CustomPassageFormatterTests extends ESTestCase {
public void testSimpleFormat() {
String content = "This is a really cool highlighter. Unified highlighter gives nice snippets back. No matches here.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
Passage[] passages = new Passage[3];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.setStartOffset(0);
passage1.setEndOffset(end + 2); //lets include the whitespace at the end to make sure we trim it
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.setStartOffset(passage1.getEndOffset());
passage2.setEndOffset(end + 26);
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Passage passage3 = new Passage();
passage3.setStartOffset(passage2.getEndOffset());
passage3.setEndOffset(content.length());
passages[2] = passage3;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(3));
assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
assertThat(fragments[0].isHighlighted(), equalTo(true));
assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
assertThat(fragments[1].isHighlighted(), equalTo(true));
assertThat(fragments[2].getText(), equalTo("No matches here."));
assertThat(fragments[2].isHighlighted(), equalTo(false));
}
public void testHtmlEncodeFormat() {
String content = "<b>This is a really cool highlighter.</b> Unified highlighter gives nice snippets back.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
Passage[] passages = new Passage[2];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.setStartOffset(0);
passage1.setEndOffset(end + 6); //lets include the whitespace at the end to make sure we trim it
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.setStartOffset(passage1.getEndOffset());
passage2.setEndOffset(content.length());
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(2));
assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
}
}

View File

@ -0,0 +1,259 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Snippet;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import static org.hamcrest.CoreMatchers.equalTo;
public class CustomUnifiedHighlighterTests extends ESTestCase {
public void testCustomUnifiedHighlighter() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
offsetsType.setStoreTermVectorOffsets(true);
offsetsType.setStoreTermVectorPositions(true);
offsetsType.setStoreTermVectors(true);
//good position but only one match
final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter.";
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue(firstValue);
//two matches, not the best snippet due to its length though
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " +
"that gets scored lower.";
Field body2 = new Field("body", "", offsetsType);
doc.add(body2);
body2.setStringValue(secondValue);
//two matches and short, will be scored highest
final String thirdValue = "This is highlighting the third short highlighting value.";
Field body3 = new Field("body", "", offsetsType);
doc.add(body3);
body3.setStringValue(thirdValue);
//one match, same as first but at the end, will be scored lower due to its position
final String fourthValue = "Just a test4 highlighting from unified highlighter.";
Field body4 = new Field("body", "", offsetsType);
doc.add(body4);
body4.setStringValue(fourthValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
String firstHlValue = "Just a test1 <b>highlighting</b> from unified highlighter.";
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a" +
" longer text that gets scored lower.";
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
String fourthHlValue = "Just a test4 <b>highlighting</b> from unified highlighter.";
IndexSearcher searcher = newSearcher(ir);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue +
HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(),
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), null, fieldValue, true);
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(4));
assertThat(snippets[0].getText(), equalTo(firstHlValue));
assertThat(snippets[1].getText(), equalTo(secondHlValue));
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
ir.close();
dir.close();
}
public void testNoMatchSize() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
offsetsType.setStoreTermVectorOffsets(true);
offsetsType.setStoreTermVectorPositions(true);
offsetsType.setStoreTermVectors(true);
Field body = new Field("body", "", offsetsType);
Field none = new Field("none", "", offsetsType);
Document doc = new Document();
doc.add(body);
doc.add(none);
String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore.";
body.setStringValue(firstValue);
none.setStringValue(firstValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
Query query = new TermQuery(new Term("none", "highlighting"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter,
null, firstValue, false);
Snippet[] snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(0));
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true);
snippets = highlighter.highlightField("body", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("This is a test."));
ir.close();
dir.close();
}
private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field textField = new Field(field, "", ft);
Document doc = new Document();
doc.add(textField);
textField.setStringValue(value);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
return ir;
}
public void testMultiPhrasePrefixQuery() throws Exception {
Analyzer analyzer = new StandardAnalyzer();
Directory dir = newDirectory();
String value = "The quick brown fox.";
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery();
query.add(new Term("text", "quick"));
query.add(new Term("text", "brown"));
query.add(new Term("text", "fo"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
ir.close();
dir.close();
}
public void testAllTermQuery() throws IOException {
Directory dir = newDirectory();
String value = "The quick brown fox.";
Analyzer analyzer = new StandardAnalyzer();
IndexReader ir = indexOneDoc(dir, "all", value, analyzer);
AllTermQuery query = new AllTermQuery(new Term("all", "fox"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("all", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The quick brown <b>fox</b>."));
ir.close();
dir.close();
}
public void testCommonTermsQuery() throws IOException {
Directory dir = newDirectory();
String value = "The quick brown fox.";
Analyzer analyzer = new StandardAnalyzer();
IndexReader ir = indexOneDoc(dir, "text", value, analyzer);
CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
query.add(new Term("text", "quick"));
query.add(new Term("text", "brown"));
query.add(new Term("text", "fox"));
IndexSearcher searcher = newSearcher(ir);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder());
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
passageFormatter, null, value, false);
Snippet[] snippets = highlighter.highlightField("text", query, docId, 5);
assertThat(snippets.length, equalTo(1));
assertThat(snippets[0].getText(), equalTo("The <b>quick</b> <b>brown</b> <b>fox</b>."));
ir.close();
dir.close();
}
}

View File

@ -126,6 +126,22 @@ the index to be bigger):
}
--------------------------------------------------
==== Unified Highlighter
experimental[]
The `unified` highlighter can extract offsets from either postings, term vectors, or via re-analyzing text.
Under the hood it uses Lucene UnifiedHighlighter which picks its strategy depending on the field and the query to highlight.
Independently of the strategy this highlighter breaks the text into sentences and scores individual sentences as
if they were documents in this corpus, using the BM25 algorithm.
It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting and can be used with the following options:
* `force_source`
* `encoder`
* `highlight_query`
* `pre_tags and `post_tags`
* `require_field_match`
==== Force highlighter type
The `type` field allows to force a specific highlighter type. This is useful

View File

@ -0,0 +1,39 @@
setup:
- do:
indices.create:
index: test
body:
mappings:
unified:
"properties":
"text":
"type": "text"
"fields":
"fvh":
"type": "text"
"term_vector": "with_positions_offsets"
"postings":
"type": "text"
"index_options": "offsets"
- do:
index:
index: test
type: unified
id: 1
body:
"text" : "The quick brown fox is brown."
- do:
indices.refresh: {}
---
"Basic":
- skip:
version: " - 5.2.99"
reason: this uses a new highlighter that has been added in 5.3
- do:
search:
body: { "query" : {"multi_match" : { "query" : "quick brown fox", "fields" : [ "text*"] } }, "highlight" : { "type" : "unified", "fields" : { "*" : {} } } }
- match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
- match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
- match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}