mirror of https://github.com/apache/lucene.git
UnifiedHighlighter highlight on multiple fields (#13268)
Add the ability for UnifiedHighlighter to combine matches from multiple fields in order to highlight a single field. FastVectorHighlighter has long had an option to highlight a single field based on matches from several fields, but UnifiedHighlighter was missing it. This change adds that ability via a new function, `UnifiedHighlighter::withMaskedFieldsFunc`, which sets up a function that, given a field, returns the set of masked fields whose matches are combined to highlight that field.
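For illustration only (not part of this commit), a minimal sketch of wiring this up through the Builder. The field names "body" and "body_english" are hypothetical, and searcher, analyzer and query are assumed to already exist, set up along the lines of the test added below:

// Highlight "body" using matches from both "body" itself and an indexed-only copy "body_english".
Function<String, Set<String>> maskedFieldsFunc =
    field -> field.equals("body") ? Set.of("body_english") : Collections.emptySet();
UnifiedHighlighter highlighter =
    new UnifiedHighlighter.Builder(searcher, analyzer)
        .withMaskedFieldsFunc(maskedFieldsFunc)
        .build();
TopDocs topDocs = searcher.search(query, 10);
String[] snippets = highlighter.highlight("body", query, topDocs, 1); // one passage per hit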
This commit is contained in:
parent e19238a7bd
commit 0345fcabb3
@@ -249,6 +249,10 @@ New Features

* GITHUB#13197: Expand support for new scalar bit levels for HNSW vectors. This includes 4-bit vectors and an option
  to compress them to gain a 50% reduction in memory usage. (Ben Trent)

* GITHUB#13268: Add ability for UnifiedHighlighter to highlight a field based on combined matches from multiple fields.
  (Mayya Sharipova, Jim Ferenczi)

Improvements
---------------------
@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;

/**
 * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
 * based on matches from multiple fields.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    super(null);
    this.fieldsOffsetStrategies = fieldsOffsetStrategies;
  }

  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: what should be returned here as offset source?
    return fieldsOffsetStrategies.getFirst().getOffsetSource();
  }

  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
    for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
      OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
      if (offsetsEnum != OffsetsEnum.EMPTY) {
        fieldsOffsetsEnums.add(offsetsEnum);
      }
    }
    return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
  }
}
@@ -31,6 +31,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@@ -122,6 +123,8 @@ public class UnifiedHighlighter {

  private Predicate<String> fieldMatcher;

  private final Function<String, Set<String>> maskedFieldsFunc;

  private Set<HighlightFlag> flags;

  // e.g. wildcards
@@ -162,6 +165,7 @@ public class UnifiedHighlighter {
    Objects.requireNonNull(
        indexAnalyzer,
        "indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
    this.maskedFieldsFunc = null;
  }

  @Deprecated
@@ -256,6 +260,8 @@ public class UnifiedHighlighter {

    private final Analyzer indexAnalyzer;
    private Predicate<String> fieldMatcher;

    private Function<String, Set<String>> maskedFieldsFunc;
    private Set<HighlightFlag> flags;
    private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
    private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
@@ -360,6 +366,22 @@ public class UnifiedHighlighter {
      return this;
    }

    /**
     * Set up a function that, given a field, returns a set of masked fields whose matches are
     * combined to highlight the given field. Masked fields should not include the original field.
     * This is useful when you want to highlight a field based on matches from several fields.
     *
     * <p>Note: All masked fields must share the same source as the field being highlighted,
     * otherwise their offsets will not correspond to the highlighted field.
     *
     * <p>Note: Only the field being highlighted must provide an original source value (e.g.
     * through a stored field); the other masked fields don't need it.
     */
    public Builder withMaskedFieldsFunc(Function<String, Set<String>> maskedFieldsFunc) {
      this.maskedFieldsFunc = maskedFieldsFunc;
      return this;
    }

    public Builder withScorer(PassageScorer value) {
      this.scorer = value;
      return this;
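As an editorial illustration of the two notes above (not part of this commit): every masked field indexes the same source text as the highlighted field, and only the highlighted field is stored. Field names and the writer variable are hypothetical; TextField, FieldType, IndexOptions, Document and Field are the standard org.apache.lucene.document / org.apache.lucene.index classes already imported in the test changes below.

// Highlighted field: stored, with offsets indexed.
FieldType storedType = new FieldType(TextField.TYPE_STORED);
storedType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
storedType.freeze();
// Masked fields: same source text, indexed only (no stored value needed).
FieldType maskedType = new FieldType(TextField.TYPE_NOT_STORED);
maskedType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
maskedType.freeze();

String text = "dance with star"; // identical source text for every field
Document doc = new Document();
doc.add(new Field("body", text, storedType));         // provides the source value for highlighting
doc.add(new Field("body_english", text, maskedType)); // different analyzer, same source text
writer.addDocument(doc);                               // writer: an open IndexWriter (assumed)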
@@ -436,6 +458,7 @@ public class UnifiedHighlighter {
    this.maxLength = builder.maxLength;
    this.breakIterator = builder.breakIterator;
    this.fieldMatcher = builder.fieldMatcher;
    this.maskedFieldsFunc = builder.maskedFieldsFunc;
    this.scorer = builder.scorer;
    this.formatter = builder.formatter;
    this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
@@ -543,6 +566,10 @@ public class UnifiedHighlighter {
    }
  }

  protected Set<String> getMaskedFields(String field) {
    return maskedFieldsFunc == null ? null : maskedFieldsFunc.apply(field);
  }

  /** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
  protected Set<HighlightFlag> getFlags(String field) {
    // If a builder is used for initializing a UH object, then flags will never be null.
@@ -1065,11 +1092,29 @@ public class UnifiedHighlighter {
  protected FieldHighlighter getFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    Set<String> maskedFields = getMaskedFields(field);
    FieldOffsetStrategy fieldOffsetStrategy;
    if (maskedFields == null || maskedFields.isEmpty()) {
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
    } else {
      List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(maskedFields.size() + 1);
      for (String maskedField : maskedFields) {
        UHComponents components = getHighlightComponents(maskedField, query, allTerms);
        OffsetSource offsetSource = getOptimizedOffsetSource(components);
        fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
      }
      // adding original field as well
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));

      fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
    }
    return newFieldHighlighter(
        field,
        fieldOffsetStrategy, // replaces the previous getOffsetStrategy(offsetSource, components) argument
        new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
        getScorer(field),
        maxPassages,
@@ -30,13 +30,20 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -54,8 +61,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
  @ParametersFactory
@@ -1337,6 +1349,121 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
    ir.close();
  }

  public void testMaskedFields() throws IOException {
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new WhitespaceAnalyzer());
    fieldAnalyzers.put("field_english", new EnglishAnalyzer()); // English stemming and stopwords
    fieldAnalyzers.put( // Each letter is a token
        "field_characters",
        new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put( // Every three letters is a token
        "field_tripples",
        new MockAnalyzer(
            random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    Analyzer analyzer =
        new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
          }
        };
    FieldType fieldTypeMatched = new FieldType(fieldType);
    fieldTypeMatched.setStored(false); // matched fields don't need to be stored
    fieldTypeMatched.freeze();

    try (Directory dir = newDirectory()) {
      try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer))) {
        Document doc = new Document();
        doc.add(new Field("field", "dance with star", fieldType));
        doc.add(new Field("field_english", "dance with star", fieldTypeMatched));
        doc.add(new Field("field_characters", "dance with star", fieldTypeMatched));
        doc.add(new Field("field_tripples", "dance with star", fieldTypeMatched));
        writer.addDocument(doc);
      }

      try (IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = newSearcher(reader);
        // field is highlighted based on the matches from the "field_english"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_english"),
            "dancing with the stars",
            "<b>dance with star</b>",
            "<b>dance</b> with <b>star</b>");

        // field is highlighted based on the matches from the "field_characters"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_characters"),
            "danc",
            "<b>danc</b>e with star",
            "<b>d</b><b>a</b><b>n</b><b>c</b>e with star");

        // field is highlighted based on the matches from the "field_tripples"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_tripples"),
            "danc",
            "<b>dan</b>ce with star",
            "<b>dan</b>ce with star");

        // field is highlighted based on the matches from the "field_characters" and
        // "field_tripples"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_tripples", "field_characters"),
            "danc",
            "<b>danc</b>e with star",
            "<b>da</b><b>n</b><b>c</b>e with star");
      }
    }
  }

  private static void maskedFieldsTestCase(
      Analyzer analyzer,
      IndexSearcher searcher,
      String field,
      Set<String> maskedFields,
      String queryText,
      String expectedSnippetWithWeightMatches,
      String expectedSnippetWithoutWeightMatches)
      throws IOException {
    QueryBuilder queryBuilder = new QueryBuilder(analyzer);
    BooleanQuery.Builder boolQueryBuilder = new BooleanQuery.Builder();
    Query fieldPhraseQuery = queryBuilder.createPhraseQuery(field, queryText, 2);
    boolQueryBuilder.add(fieldPhraseQuery, BooleanClause.Occur.SHOULD);
    for (String maskedField : maskedFields) {
      fieldPhraseQuery = queryBuilder.createPhraseQuery(maskedField, queryText, 2);
      boolQueryBuilder.add(fieldPhraseQuery, BooleanClause.Occur.SHOULD);
    }
    Query query = boolQueryBuilder.build();
    TopDocs topDocs = searcher.search(query, 10);
    assertEquals(1, topDocs.totalHits.value);

    Function<String, Set<String>> maskedFieldsFunc =
        fieldName -> fieldName.equals(field) ? maskedFields : Collections.emptySet();
    UnifiedHighlighter.Builder uhBuilder =
        new UnifiedHighlighter.Builder(searcher, analyzer).withMaskedFieldsFunc(maskedFieldsFunc);
    UnifiedHighlighter highlighter =
        randomUnifiedHighlighter(
            uhBuilder, EnumSet.of(HighlightFlag.PHRASES), random().nextBoolean());
    String[] snippets = highlighter.highlight(field, query, topDocs, 10);
    String expectedSnippet =
        highlighter.getFlags(field).contains(HighlightFlag.WEIGHT_MATCHES)
            ? expectedSnippetWithWeightMatches
            : expectedSnippetWithoutWeightMatches;
    assertEquals(1, snippets.length);
    assertEquals(expectedSnippet, snippets[0]);
  }

  public void testMatchesSlopBug() throws IOException {
    IndexReader ir = indexSomeFields();
    IndexSearcher searcher = newSearcher(ir);