UnifiedHighlighter highlight on multiple fields (#13268)

Add ability to UnifiedHighlighter to combine matches from multiple fields
to highlight a single field.

FastVectorHighlighter has long had an option to highlight a single field
based on matches from several fields, but UnifiedHighlighter was missing this option.

This adds this ability with a new function: `UnifiedHighlighter::withMaskedFieldsFunc`
that sets up a function that, given a field, returns a set of masked fields whose matches
are combined to highlight the given field.
This commit is contained in:
Mayya Sharipova 2024-04-12 06:36:25 -04:00 committed by GitHub
parent e19238a7bd
commit 0345fcabb3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 241 additions and 3 deletions

View File

@ -249,6 +249,10 @@ New Features
* GITHUB#13197: Expand support for new scalar bit levels for HNSW vectors. This includes 4-bit vectors and an option
to compress them to gain a 50% reduction in memory usage. (Ben Trent)
* GITHUB#13268: Add ability for UnifiedHighlighter to highlight a field based on combined matches from multiple fields.
(Mayya Sharipova, Jim Ferenczi)
Improvements
---------------------

View File

@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;
/**
 * A {@link FieldOffsetStrategy} that delegates to a list of per-field strategies and merges the
 * offsets they produce, so that one field can be highlighted using matches found in several
 * fields.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {

  /** The wrapped strategies, one per contributing field; consulted in list order. */
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  /**
   * Creates a strategy that combines the given per-field strategies.
   *
   * @param fieldsOffsetStrategies the strategies whose offsets are merged; the list is kept by
   *     reference, not copied
   */
  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    // No highlight components are passed to the parent: this strategy spans several fields.
    super(null);
    this.fieldsOffsetStrategies = fieldsOffsetStrategies;
  }

  /**
   * Always fails, because this strategy is not bound to one field.
   *
   * @throws IllegalStateException on every call
   */
  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  /** Reports the offset source of the first wrapped strategy. */
  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: it is still undecided which offset source a combined strategy should report;
    // for now the first delegate answers.
    final FieldOffsetStrategy firstStrategy = fieldsOffsetStrategies.getFirst();
    return firstStrategy.getOffsetSource();
  }

  /**
   * Asks every wrapped strategy for its offsets in the given document and returns them wrapped in
   * a single {@link OffsetsEnum.MultiOffsetsEnum}.
   *
   * @param reader the leaf reader handed on to each wrapped strategy
   * @param docId the document id handed on to each wrapped strategy
   * @param content the field content handed on to each wrapped strategy
   * @return an enum over the non-empty results of all wrapped strategies
   * @throws IOException if a wrapped strategy fails to read offsets
   */
  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    final List<OffsetsEnum> collected = new ArrayList<>(fieldsOffsetStrategies.size());
    for (FieldOffsetStrategy strategy : fieldsOffsetStrategies) {
      final OffsetsEnum candidate = strategy.getOffsetsEnum(reader, docId, content);
      if (candidate == OffsetsEnum.EMPTY) {
        // Strategies that found nothing contribute nothing to the merged enum.
        continue;
      }
      collected.add(candidate);
    }
    return new OffsetsEnum.MultiOffsetsEnum(collected);
  }
}

View File

@ -31,6 +31,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@ -122,6 +123,8 @@ public class UnifiedHighlighter {
private Predicate<String> fieldMatcher;
private final Function<String, Set<String>> maskedFieldsFunc;
private Set<HighlightFlag> flags;
// e.g. wildcards
@ -162,6 +165,7 @@ public class UnifiedHighlighter {
Objects.requireNonNull(
indexAnalyzer,
"indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
this.maskedFieldsFunc = null;
}
@Deprecated
@ -256,6 +260,8 @@ public class UnifiedHighlighter {
private final Analyzer indexAnalyzer;
private Predicate<String> fieldMatcher;
private Function<String, Set<String>> maskedFieldsFunc;
private Set<HighlightFlag> flags;
private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
@ -360,6 +366,22 @@ public class UnifiedHighlighter {
return this;
}
/**
* Set up a function that, given a field, returns a set of masked fields whose matches are combined
* to highlight the given field. Masked fields should not include the original field. This is
* useful when you want to highlight a field based on matches from several fields.
*
* <p>Note: All masked fields must share the same source as the field being highlighted,
* otherwise their offsets will not correspond to the highlighted field.
*
* <p>Note: Only the field being highlighted must provide an original source value (e.g. through
* stored field), other masked fields don't need it.
*
* @param maskedFieldsFunc function mapping a highlighted field name to the names of its masked
*     fields
* @return this builder, to allow call chaining
*/
public Builder withMaskedFieldsFunc(Function<String, Set<String>> maskedFieldsFunc) {
this.maskedFieldsFunc = maskedFieldsFunc;
return this;
}
public Builder withScorer(PassageScorer value) {
this.scorer = value;
return this;
@ -436,6 +458,7 @@ public class UnifiedHighlighter {
this.maxLength = builder.maxLength;
this.breakIterator = builder.breakIterator;
this.fieldMatcher = builder.fieldMatcher;
this.maskedFieldsFunc = builder.maskedFieldsFunc;
this.scorer = builder.scorer;
this.formatter = builder.formatter;
this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
@ -543,6 +566,10 @@ public class UnifiedHighlighter {
}
}
/**
 * Looks up the masked fields configured for the given field.
 *
 * @param field the name of the field being highlighted
 * @return whatever the configured masked-fields function returns for {@code field}, or {@code
 *     null} when no such function has been configured
 */
protected Set<String> getMaskedFields(String field) {
  if (maskedFieldsFunc != null) {
    return maskedFieldsFunc.apply(field);
  }
  return null;
}
/** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
protected Set<HighlightFlag> getFlags(String field) {
// If a builder is used for initializing a UH object, then flags will never be null.
@ -1065,11 +1092,29 @@ public class UnifiedHighlighter {
protected FieldHighlighter getFieldHighlighter(
String field, Query query, Set<Term> allTerms, int maxPassages) {
Set<String> maskedFields = getMaskedFields(field);
FieldOffsetStrategy fieldOffsetStrategy;
if (maskedFields == null || maskedFields.isEmpty()) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
} else {
List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(maskedFields.size() + 1);
for (String maskedField : maskedFields) {
UHComponents components = getHighlightComponents(maskedField, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
}
// adding original field as well
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
}
return newFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
fieldOffsetStrategy,
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,

View File

@ -30,13 +30,20 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@ -54,8 +61,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
@ParametersFactory
@ -1337,6 +1349,121 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
ir.close();
}
/**
 * Indexes the same text into one stored field and three differently analyzed, unstored fields,
 * then checks that the stored field is highlighted from the matches of the masked fields.
 */
public void testMaskedFields() throws IOException {
  // One analyzer per field name; the wrapper below dispatches on the field name.
  final Map<String, Analyzer> analyzersByField = new TreeMap<>();
  analyzersByField.put("field", new WhitespaceAnalyzer());
  // English stemming and stopwords
  analyzersByField.put("field_english", new EnglishAnalyzer());
  // Each letter is a token
  analyzersByField.put(
      "field_characters",
      new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
  // Every three letters is a token
  analyzersByField.put(
      "field_tripples",
      new MockAnalyzer(
          random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));

  final Analyzer perFieldAnalyzer =
      new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        public Analyzer getWrappedAnalyzer(String fieldName) {
          return analyzersByField.get(fieldName);
        }
      };

  // Matched (masked) fields don't need to be stored.
  final FieldType unstoredType = new FieldType(fieldType);
  unstoredType.setStored(false);
  unstoredType.freeze();

  final String text = "dance with star";
  final String[] maskedFieldNames = {"field_english", "field_characters", "field_tripples"};

  try (Directory dir = newDirectory()) {
    // The writer is closed before the reader is opened.
    try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(perFieldAnalyzer))) {
      final Document doc = new Document();
      doc.add(new Field("field", text, fieldType));
      for (String maskedFieldName : maskedFieldNames) {
        doc.add(new Field(maskedFieldName, text, unstoredType));
      }
      writer.addDocument(doc);
    }

    try (IndexReader reader = DirectoryReader.open(dir)) {
      final IndexSearcher searcher = newSearcher(reader);

      // "field" highlighted from the matches of "field_english"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          searcher,
          "field",
          Set.of("field_english"),
          "dancing with the stars",
          "<b>dance with star</b>",
          "<b>dance</b> with <b>star</b>");

      // "field" highlighted from the matches of "field_characters"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          searcher,
          "field",
          Set.of("field_characters"),
          "danc",
          "<b>danc</b>e with star",
          "<b>d</b><b>a</b><b>n</b><b>c</b>e with star");

      // "field" highlighted from the matches of "field_tripples"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          searcher,
          "field",
          Set.of("field_tripples"),
          "danc",
          "<b>dan</b>ce with star",
          "<b>dan</b>ce with star");

      // "field" highlighted from the combined matches of "field_characters" and
      // "field_tripples"
      maskedFieldsTestCase(
          perFieldAnalyzer,
          searcher,
          "field",
          Set.of("field_tripples", "field_characters"),
          "danc",
          "<b>danc</b>e with star",
          "<b>da</b><b>n</b><b>c</b>e with star");
    }
  }
}
/**
 * Runs one masked-fields scenario: searches with a disjunction of sloppy phrase queries over
 * {@code field} and every masked field, highlights {@code field}, and compares the single
 * resulting snippet with the expectation matching the highlighter's flags.
 *
 * @param analyzer analyzer used both to build the queries and to create the highlighter
 * @param searcher searcher over the index under test
 * @param field the field to highlight
 * @param maskedFields fields whose matches are combined into the highlighting of {@code field}
 * @param queryText text turned into a phrase query (slop 2) on each field
 * @param expectedSnippetWithWeightMatches expected snippet when {@link
 *     HighlightFlag#WEIGHT_MATCHES} is among the highlighter's flags
 * @param expectedSnippetWithoutWeightMatches expected snippet otherwise
 */
private static void maskedFieldsTestCase(
    Analyzer analyzer,
    IndexSearcher searcher,
    String field,
    Set<String> maskedFields,
    String queryText,
    String expectedSnippetWithWeightMatches,
    String expectedSnippetWithoutWeightMatches)
    throws IOException {
  // One optional phrase clause for the highlighted field, then one per masked field.
  final QueryBuilder phraseFactory = new QueryBuilder(analyzer);
  final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
  disjunction.add(
      phraseFactory.createPhraseQuery(field, queryText, 2), BooleanClause.Occur.SHOULD);
  for (String maskedField : maskedFields) {
    disjunction.add(
        phraseFactory.createPhraseQuery(maskedField, queryText, 2), BooleanClause.Occur.SHOULD);
  }
  final Query query = disjunction.build();

  final TopDocs hits = searcher.search(query, 10);
  assertEquals(1, hits.totalHits.value);

  // Only the highlighted field is mapped to masked fields; any other field gets none.
  final Function<String, Set<String>> maskedFieldsFunc =
      fieldName -> {
        if (fieldName.equals(field)) {
          return maskedFields;
        }
        return Collections.emptySet();
      };

  final UnifiedHighlighter.Builder uhBuilder =
      new UnifiedHighlighter.Builder(searcher, analyzer).withMaskedFieldsFunc(maskedFieldsFunc);
  final UnifiedHighlighter highlighter =
      randomUnifiedHighlighter(
          uhBuilder, EnumSet.of(HighlightFlag.PHRASES), random().nextBoolean());

  final String[] snippets = highlighter.highlight(field, query, hits, 10);

  // The expected markup depends on whether the highlighter ended up with WEIGHT_MATCHES.
  final String expectedSnippet;
  if (highlighter.getFlags(field).contains(HighlightFlag.WEIGHT_MATCHES)) {
    expectedSnippet = expectedSnippetWithWeightMatches;
  } else {
    expectedSnippet = expectedSnippetWithoutWeightMatches;
  }

  assertEquals(1, snippets.length);
  assertEquals(expectedSnippet, snippets[0]);
}
public void testMatchesSlopBug() throws IOException {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);