mirror of https://github.com/apache/lucene.git
UnifiedHighlighter highlight on multiple fields (#13268)
Add the ability for UnifiedHighlighter to combine matches from multiple fields in order to highlight a single field. FastVectorHighlighter has long had an option to highlight a single field based on matches from several fields, but UnifiedHighlighter was missing it. This change adds that ability via a new function, `UnifiedHighlighter::withMaskedFieldsFunc`, which sets up a function that, given a field, returns the set of masked fields whose matches are combined to highlight that field.
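For illustration only (not part of this commit), a minimal sketch of wiring this up through the Builder. The field names "body" and "body_english" are hypothetical, and searcher, analyzer and query are assumed to already exist, set up along the lines of the test added below:

// Highlight "body" using matches from both "body" itself and an indexed-only copy "body_english".
Function<String, Set<String>> maskedFieldsFunc =
    field -> field.equals("body") ? Set.of("body_english") : Collections.emptySet();
UnifiedHighlighter highlighter =
    new UnifiedHighlighter.Builder(searcher, analyzer)
        .withMaskedFieldsFunc(maskedFieldsFunc)
        .build();
TopDocs topDocs = searcher.search(query, 10);
String[] snippets = highlighter.highlight("body", query, topDocs, 1); // one passage per hit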
This commit is contained in:
parent e19238a7bd
commit 0345fcabb3
@@ -249,6 +249,10 @@ New Features

* GITHUB#13197: Expand support for new scalar bit levels for HNSW vectors. This includes 4-bit vectors and an option
  to compress them to gain a 50% reduction in memory usage. (Ben Trent)

* GITHUB#13268: Add ability for UnifiedHighlighter to highlight a field based on combined matches from multiple fields.
  (Mayya Sharipova, Jim Ferenczi)

Improvements
---------------------
@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;

/**
 * FieldOffsetStrategy that combines offsets from multiple fields. Used to highlight a single field
 * based on matches from multiple fields.
 *
 * @lucene.internal
 */
public class MultiFieldsOffsetStrategy extends FieldOffsetStrategy {
  private final List<FieldOffsetStrategy> fieldsOffsetStrategies;

  public MultiFieldsOffsetStrategy(List<FieldOffsetStrategy> fieldsOffsetStrategies) {
    super(null);
    this.fieldsOffsetStrategies = fieldsOffsetStrategies;
  }

  @Override
  public String getField() {
    throw new IllegalStateException("MultiFieldsOffsetStrategy does not have a single field.");
  }

  @Override
  public UnifiedHighlighter.OffsetSource getOffsetSource() {
    // TODO: what should be returned here as offset source?
    return fieldsOffsetStrategies.getFirst().getOffsetSource();
  }

  @Override
  public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
      throws IOException {
    List<OffsetsEnum> fieldsOffsetsEnums = new ArrayList<>(fieldsOffsetStrategies.size());
    for (FieldOffsetStrategy fieldOffsetStrategy : fieldsOffsetStrategies) {
      OffsetsEnum offsetsEnum = fieldOffsetStrategy.getOffsetsEnum(reader, docId, content);
      if (offsetsEnum != OffsetsEnum.EMPTY) {
        fieldsOffsetsEnums.add(offsetsEnum);
      }
    }
    return new OffsetsEnum.MultiOffsetsEnum(fieldsOffsetsEnums);
  }
}
@@ -31,6 +31,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@@ -122,6 +123,8 @@ public class UnifiedHighlighter {

  private Predicate<String> fieldMatcher;

  private final Function<String, Set<String>> maskedFieldsFunc;

  private Set<HighlightFlag> flags;

  // e.g. wildcards
@@ -162,6 +165,7 @@ public class UnifiedHighlighter {
    Objects.requireNonNull(
        indexAnalyzer,
        "indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
    this.maskedFieldsFunc = null;
  }

  @Deprecated
@@ -256,6 +260,8 @@ public class UnifiedHighlighter {

    private final Analyzer indexAnalyzer;
    private Predicate<String> fieldMatcher;

    private Function<String, Set<String>> maskedFieldsFunc;
    private Set<HighlightFlag> flags;
    private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
    private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
@@ -360,6 +366,22 @@ public class UnifiedHighlighter {
      return this;
    }

    /**
     * Set up a function that, given a field, returns a set of masked fields whose matches are
     * combined to highlight the given field. Masked fields should not include the original field.
     * This is useful when you want to highlight a field based on matches from several fields.
     *
     * <p>Note: All masked fields must share the same source as the field being highlighted,
     * otherwise their offsets will not correspond to the highlighted field.
     *
     * <p>Note: Only the field being highlighted must provide an original source value (e.g.
     * through a stored field); the other masked fields don't need it.
     */
    public Builder withMaskedFieldsFunc(Function<String, Set<String>> maskedFieldsFunc) {
      this.maskedFieldsFunc = maskedFieldsFunc;
      return this;
    }

    public Builder withScorer(PassageScorer value) {
      this.scorer = value;
      return this;
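As an editorial illustration of the two notes above (not part of this commit): every masked field indexes the same source text as the highlighted field, and only the highlighted field is stored. Field names and the writer variable are hypothetical; TextField, FieldType, IndexOptions, Document and Field are the standard org.apache.lucene.document / org.apache.lucene.index classes already imported in the test changes below.

// Highlighted field: stored, with offsets indexed.
FieldType storedType = new FieldType(TextField.TYPE_STORED);
storedType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
storedType.freeze();
// Masked fields: same source text, indexed only (no stored value needed).
FieldType maskedType = new FieldType(TextField.TYPE_NOT_STORED);
maskedType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
maskedType.freeze();

String text = "dance with star"; // identical source text for every field
Document doc = new Document();
doc.add(new Field("body", text, storedType));         // provides the source value for highlighting
doc.add(new Field("body_english", text, maskedType)); // different analyzer, same source text
writer.addDocument(doc);                               // writer: an open IndexWriter (assumed)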
@@ -436,6 +458,7 @@ public class UnifiedHighlighter {
    this.maxLength = builder.maxLength;
    this.breakIterator = builder.breakIterator;
    this.fieldMatcher = builder.fieldMatcher;
    this.maskedFieldsFunc = builder.maskedFieldsFunc;
    this.scorer = builder.scorer;
    this.formatter = builder.formatter;
    this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
@@ -543,6 +566,10 @@ public class UnifiedHighlighter {
    }
  }

  protected Set<String> getMaskedFields(String field) {
    return maskedFieldsFunc == null ? null : maskedFieldsFunc.apply(field);
  }

  /** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
  protected Set<HighlightFlag> getFlags(String field) {
    // If a builder is used for initializing a UH object, then flags will never be null.
@@ -1065,11 +1092,29 @@ public class UnifiedHighlighter {
  protected FieldHighlighter getFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    Set<String> maskedFields = getMaskedFields(field);
    FieldOffsetStrategy fieldOffsetStrategy;
    if (maskedFields == null || maskedFields.isEmpty()) {
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
    } else {
      List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(maskedFields.size() + 1);
      for (String maskedField : maskedFields) {
        UHComponents components = getHighlightComponents(maskedField, query, allTerms);
        OffsetSource offsetSource = getOptimizedOffsetSource(components);
        fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
      }
      // adding original field as well
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));

      fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
    }
    return newFieldHighlighter(
        field,
        fieldOffsetStrategy, // replaces the previous getOffsetStrategy(offsetSource, components) argument
        new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
        getScorer(field),
        maxPassages,
@@ -30,13 +30,20 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -54,8 +61,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
  @ParametersFactory
@@ -1337,6 +1349,121 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
    ir.close();
  }

  public void testMaskedFields() throws IOException {
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new WhitespaceAnalyzer());
    fieldAnalyzers.put("field_english", new EnglishAnalyzer()); // English stemming and stopwords
    fieldAnalyzers.put( // Each letter is a token
        "field_characters",
        new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put( // Every three letters is a token
        "field_tripples",
        new MockAnalyzer(
            random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    Analyzer analyzer =
        new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
          }
        };
    FieldType fieldTypeMatched = new FieldType(fieldType);
    fieldTypeMatched.setStored(false); // matched fields don't need to be stored
    fieldTypeMatched.freeze();

    try (Directory dir = newDirectory()) {
      try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer))) {
        Document doc = new Document();
        doc.add(new Field("field", "dance with star", fieldType));
        doc.add(new Field("field_english", "dance with star", fieldTypeMatched));
        doc.add(new Field("field_characters", "dance with star", fieldTypeMatched));
        doc.add(new Field("field_tripples", "dance with star", fieldTypeMatched));
        writer.addDocument(doc);
      }

      try (IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = newSearcher(reader);
        // field is highlighted based on the matches from the "field_english"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_english"),
            "dancing with the stars",
            "<b>dance with star</b>",
            "<b>dance</b> with <b>star</b>");

        // field is highlighted based on the matches from the "field_characters"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_characters"),
            "danc",
            "<b>danc</b>e with star",
            "<b>d</b><b>a</b><b>n</b><b>c</b>e with star");

        // field is highlighted based on the matches from the "field_tripples"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_tripples"),
            "danc",
            "<b>dan</b>ce with star",
            "<b>dan</b>ce with star");

        // field is highlighted based on the matches from the "field_characters" and
        // "field_tripples"
        maskedFieldsTestCase(
            analyzer,
            searcher,
            "field",
            Set.of("field_tripples", "field_characters"),
            "danc",
            "<b>danc</b>e with star",
            "<b>da</b><b>n</b><b>c</b>e with star");
      }
    }
  }

  private static void maskedFieldsTestCase(
      Analyzer analyzer,
      IndexSearcher searcher,
      String field,
      Set<String> maskedFields,
      String queryText,
      String expectedSnippetWithWeightMatches,
      String expectedSnippetWithoutWeightMatches)
      throws IOException {
    QueryBuilder queryBuilder = new QueryBuilder(analyzer);
    BooleanQuery.Builder boolQueryBuilder = new BooleanQuery.Builder();
    Query fieldPhraseQuery = queryBuilder.createPhraseQuery(field, queryText, 2);
    boolQueryBuilder.add(fieldPhraseQuery, BooleanClause.Occur.SHOULD);
    for (String maskedField : maskedFields) {
      fieldPhraseQuery = queryBuilder.createPhraseQuery(maskedField, queryText, 2);
      boolQueryBuilder.add(fieldPhraseQuery, BooleanClause.Occur.SHOULD);
    }
    Query query = boolQueryBuilder.build();
    TopDocs topDocs = searcher.search(query, 10);
    assertEquals(1, topDocs.totalHits.value);

    Function<String, Set<String>> maskedFieldsFunc =
        fieldName -> fieldName.equals(field) ? maskedFields : Collections.emptySet();
    UnifiedHighlighter.Builder uhBuilder =
        new UnifiedHighlighter.Builder(searcher, analyzer).withMaskedFieldsFunc(maskedFieldsFunc);
    UnifiedHighlighter highlighter =
        randomUnifiedHighlighter(
            uhBuilder, EnumSet.of(HighlightFlag.PHRASES), random().nextBoolean());
    String[] snippets = highlighter.highlight(field, query, topDocs, 10);
    String expectedSnippet =
        highlighter.getFlags(field).contains(HighlightFlag.WEIGHT_MATCHES)
            ? expectedSnippetWithWeightMatches
            : expectedSnippetWithoutWeightMatches;
    assertEquals(1, snippets.length);
    assertEquals(expectedSnippet, snippets[0]);
  }

  public void testMatchesSlopBug() throws IOException {
    IndexReader ir = indexSomeFields();
    IndexSearcher searcher = newSearcher(ir);