LUCENE-7575: Add UnifiedHighlighter field matcher predicate (AKA requireFieldMatch=false)

(cherry picked from commit 2e948fe)
2016-12-05 16:11:57 -05:00 · 2016-12-05 16:11:57 -05:00 · 4e7a7dbf9a
parent cdce621087
commit 4e7a7dbf9a
7 changed files with 519 additions and 84 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -3,6 +3,57 @@ Lucene Change Log
 For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions

+======================= Lucene 7.0.0 =======================
+
+API Changes
+
+* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
+  Use setSplitOnWhitespace(true) to get the old behavior.  (Steve Rowe)
+
+* LUCENE-7369: Similarity.coord and BooleanQuery.disableCoord are removed.
+  (Adrien Grand)
+
+* LUCENE-7368: Removed query normalization. (Adrien Grand)
+
+* LUCENE-7355: AnalyzingQueryParser has been removed as its functionality has
+  been folded into the classic QueryParser. (Adrien Grand)
+
+* LUCENE-7407: Doc values APIs have been switched from random access
+  to iterators, enabling future codec compression improvements. (Mike
+  McCandless)
+
+* LUCENE-7475: Norms now support sparsity, allowing to pay for what is
+  actually used. (Adrien Grand)
+
+* LUCENE-7494: Points now have a per-field API, like doc values. (Adrien Grand)
+
+Bug Fixes
+
+Improvements
+
+* LUCENE-7489: Better storage of sparse doc-values fields with the default
+  codec. (Adrien Grand)
+
+Optimizations
+
+* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
+  in the sets of SHOULD and FILTER clauses, or both in MUST/FILTER and MUST_NOT
+  clauses. (Spyros Kapnissis via Adrien Grand, Uwe Schindler)
+
+* LUCENE-7506: FastTaxonomyFacetCounts should use CPU in proportion to
+  the size of the intersected set of hits from the query and documents
+  that have a facet value, so sparse faceting works as expected
+  (Adrien Grand via Mike McCandless)
+
+* LUCENE-7519: Add optimized APIs to compute browse-only top level
+  facets (Mike McCandless)
+
+Other
+
+* LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
+
+* LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
+
 ======================= Lucene 6.4.0 =======================

 API Changes
@ -73,6 +124,11 @@ Improvements
 * LUCENE-7537: Index time sorting now supports multi-valued sorts
  using selectors (MIN, MAX, etc.) (Jim Ferenczi via Mike McCandless)

+* LUCENE-7575: UnifiedHighlighter can now highlight fields with queries that don't
+  necessarily refer to that field (AKA requireFieldMatch==false). Disabled by default.
+  See UnifiedHighlighter.getFieldMatcher/setFieldMatcher. (Jim Ferenczi via David Smiley)
+
+
 Optimizations

 * LUCENE-7568: Optimize merging when index sorting is used but the
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@ -23,6 +23,7 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.function.Function;
+import java.util.function.Predicate;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.FilteringTokenFilter;
@ -49,7 +50,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
  private final LeafReader leafReader;
  private final CharacterRunAutomaton preMemIndexFilterAutomaton;

-  public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
+  public MemoryIndexOffsetStrategy(String field, Predicate<String> fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
                                   CharacterRunAutomaton[] automata, Analyzer analyzer,
                                   Function<Query, Collection<Query>> multiTermQueryRewrite) {
    super(field, extractedTerms, phraseHelper, automata, analyzer);
@ -57,13 +58,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
    memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
    leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
    // preFilter for MemoryIndex
-    preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
+    preMemIndexFilterAutomaton = buildCombinedAutomaton(fieldMatcher, terms, this.automata, phraseHelper, multiTermQueryRewrite);
  }

  /**
   * Build one {@link CharacterRunAutomaton} matching any term the query might match.
   */
-  private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
+  private static CharacterRunAutomaton buildCombinedAutomaton(Predicate<String> fieldMatcher,
+                                                              BytesRef[] terms,
                                                              CharacterRunAutomaton[] automata,
                                                              PhraseHelper strictPhrases,
                                                              Function<Query, Collection<Query>> multiTermQueryRewrite) {
@ -74,7 +76,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
    Collections.addAll(allAutomata, automata);
    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
      Collections.addAll(allAutomata,
-          MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+          MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));//true==lookInSpan
    }

    if (allAutomata.size() == 1) {
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@ -22,6 +22,7 @@ import java.util.Collection;
 import java.util.Comparator;
 import java.util.List;
 import java.util.function.Function;
+import java.util.function.Predicate;

 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.AutomatonQuery;
@ -56,50 +57,52 @@ class MultiTermHighlighting {
  }

  /**
-   * Extracts all MultiTermQueries for {@code field}, and returns equivalent
-   * automata that will match terms.
+   * Extracts MultiTermQueries that match the provided field predicate.
+   * Returns equivalent automata that will match terms.
   */
-  public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan,
+  public static CharacterRunAutomaton[] extractAutomata(Query query,
+                                                        Predicate<String> fieldMatcher,
+                                                        boolean lookInSpan,
                                                        Function<Query, Collection<Query>> preRewriteFunc) {
    List<CharacterRunAutomaton> list = new ArrayList<>();
    Collection<Query> customSubQueries = preRewriteFunc.apply(query);
    if (customSubQueries != null) {
      for (Query sub : customSubQueries) {
-        list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
+        list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
      }
    } else if (query instanceof BooleanQuery) {
      for (BooleanClause clause : (BooleanQuery) query) {
        if (!clause.isProhibited()) {
-          list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc)));
+          list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
        }
      }
    } else if (query instanceof ConstantScoreQuery) {
-      list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan,
+      list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan,
          preRewriteFunc)));
    } else if (query instanceof DisjunctionMaxQuery) {
      for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
-        list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
+        list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
      }
    } else if (lookInSpan && query instanceof SpanOrQuery) {
      for (Query sub : ((SpanOrQuery) query).getClauses()) {
-        list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
+        list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
      }
    } else if (lookInSpan && query instanceof SpanNearQuery) {
      for (Query sub : ((SpanNearQuery) query).getClauses()) {
-        list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
+        list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
      }
    } else if (lookInSpan && query instanceof SpanNotQuery) {
-      list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan,
+      list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan,
          preRewriteFunc)));
    } else if (lookInSpan && query instanceof SpanPositionCheckQuery) {
-      list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan,
+      list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan,
          preRewriteFunc)));
    } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
-      list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field,
-          lookInSpan, preRewriteFunc)));
+      list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
+          fieldMatcher, lookInSpan, preRewriteFunc)));
    } else if (query instanceof AutomatonQuery) {
      final AutomatonQuery aq = (AutomatonQuery) query;
-      if (aq.getField().equals(field)) {
+      if (fieldMatcher.test(aq.getField())) {
        list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
          @Override
          public String toString() {
@ -110,7 +113,7 @@ class MultiTermHighlighting {
    } else if (query instanceof PrefixQuery) {
      final PrefixQuery pq = (PrefixQuery) query;
      Term prefix = pq.getPrefix();
-      if (prefix.field().equals(field)) {
+      if (fieldMatcher.test(prefix.field())) {
        list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
            Automata.makeAnyString())) {
          @Override
@ -121,7 +124,7 @@ class MultiTermHighlighting {
      }
    } else if (query instanceof FuzzyQuery) {
      final FuzzyQuery fq = (FuzzyQuery) query;
-      if (fq.getField().equals(field)) {
+      if (fieldMatcher.test(fq.getField())) {
        String utf16 = fq.getTerm().text();
        int termText[] = new int[utf16.codePointCount(0, utf16.length())];
        for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
@ -142,7 +145,7 @@ class MultiTermHighlighting {
      }
    } else if (query instanceof TermRangeQuery) {
      final TermRangeQuery tq = (TermRangeQuery) query;
-      if (tq.getField().equals(field)) {
+      if (fieldMatcher.test(tq.getField())) {
        final CharsRef lowerBound;
        if (tq.getLowerTerm() == null) {
          lowerBound = null;
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@ -16,17 +16,50 @@
 */
 package org.apache.lucene.search.uhighlight;

-import org.apache.lucene.index.*;
-import org.apache.lucene.search.*;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Function;
+import java.util.function.Predicate;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.highlight.WeightedSpanTerm;
 import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
-import org.apache.lucene.search.spans.*;
+import org.apache.lucene.search.spans.SpanCollector;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanWeight;
+import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.BytesRef;

-import java.io.IOException;
-import java.util.*;
-import java.util.function.Function;
-
 /**
 * Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly).
 * This is a stateful class holding information about the query, but it can (and is) re-used across highlighting
@ -40,7 +73,7 @@ import java.util.function.Function;
 public class PhraseHelper {

  public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
-      spanQuery -> null, query -> null, true);
+      (s) -> false, spanQuery -> null, query -> null, true);

  //TODO it seems this ought to be a general thing on Spans?
  private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
@ -59,10 +92,11 @@ public class PhraseHelper {
    }
  };

-  private final String fieldName; // if non-null, only look at queries/terms for this field
+  private final String fieldName;
  private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
  private final Set<SpanQuery> spanQueries;
  private final boolean willRewrite;
+  private final Predicate<String> fieldMatcher;

  /**
   * Constructor.
@ -73,14 +107,15 @@ public class PhraseHelper {
   * to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked.
   * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is
   * usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
+   * {@code fieldMatcher} is the field name predicate used to select the parts of the query that must be highlighted.
   */
-  public PhraseHelper(Query query, String field, Function<SpanQuery, Boolean> rewriteQueryPred,
+  public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher, Function<SpanQuery, Boolean> rewriteQueryPred,
                      Function<Query, Collection<Query>> preExtractRewriteFunction,
                      boolean ignoreQueriesNeedingRewrite) {
-    this.fieldName = field; // if null then don't require field match
+    this.fieldName = field;
+    this.fieldMatcher = fieldMatcher;
    // filter terms to those we want
-    positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>();
-    // requireFieldMatch optional
+    positionInsensitiveTerms = new FieldFilteringTermSet();
    spanQueries = new HashSet<>();

    // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
@ -131,11 +166,11 @@ public class PhraseHelper {
      @Override
      protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
                                              float boost) throws IOException {
-        if (field != null) {
-          // if this span query isn't for this field, skip it.
-          Set<String> fieldNameSet = new HashSet<>();//TODO reuse.  note: almost always size 1
-          collectSpanQueryFields(spanQuery, fieldNameSet);
-          if (!fieldNameSet.contains(field)) {
+        // if any field of this span query is rejected by the field matcher, skip it.
+        Set<String> fieldNameSet = new HashSet<>();//TODO reuse.  note: almost always size 1
+        collectSpanQueryFields(spanQuery, fieldNameSet);
+        for (String spanField : fieldNameSet) {
+          if (!fieldMatcher.test(spanField)) {
            return;
          }
        }
@ -190,10 +225,11 @@ public class PhraseHelper {
    if (spanQueries.isEmpty()) {
      return Collections.emptyMap();
    }
+    final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
    // for each SpanQuery, collect the member spans into a map.
    Map<BytesRef, Spans> result = new HashMap<>();
    for (SpanQuery spanQuery : spanQueries) {
-      getTermToSpans(spanQuery, leafReader.getContext(), doc, result);
+      getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
    }
    return result;
  }
@ -203,15 +239,14 @@ public class PhraseHelper {
                              int doc, Map<BytesRef, Spans> result)
      throws IOException {
    // note: in WSTE there was some field specific looping that seemed pointless so that isn't here.
-    final IndexSearcher searcher = new IndexSearcher(readerContext);
+    final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
    searcher.setQueryCache(null);
    if (willRewrite) {
      spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
    }

    // Get the underlying query terms
-
-    TreeSet<Term> termSet = new TreeSet<>(); // sorted so we can loop over results in order shortly...
+    TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
    searcher.createWeight(spanQuery, false).extractTerms(termSet);//needsScores==false

    // Get Spans by running the query against the reader
@ -240,9 +275,6 @@ public class PhraseHelper {
    for (final Term queryTerm : termSet) {
      // note: we expect that at least one query term will pass these filters. This is because the collected
      //   spanQuery list were already filtered by these conditions.
-      if (fieldName != null && fieldName.equals(queryTerm.field()) == false) {
-        continue;
-      }
      if (positionInsensitiveTerms.contains(queryTerm)) {
        continue;
      }
@ -375,19 +407,17 @@ public class PhraseHelper {
  }

  /**
-   * Simple HashSet that filters out Terms not matching a desired field on {@code add()}.
+   * Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}.
   */
-  private static class FieldFilteringTermHashSet extends HashSet<Term> {
-    private final String field;
-
-    FieldFilteringTermHashSet(String field) {
-      this.field = field;
-    }
-
+  private class FieldFilteringTermSet extends TreeSet<Term> {
    @Override
    public boolean add(Term term) {
-      if (term.field().equals(field)) {
-        return super.add(term);
+      if (fieldMatcher.test(term.field())) {
+        if (term.field().equals(fieldName)) {
+          return super.add(term);
+        } else {
+          return super.add(new Term(fieldName, term.bytes()));
+        }
      } else {
        return false;
      }
@ -499,6 +529,64 @@ public class PhraseHelper {
    }
  }

+  /**
+   * This reader redirects every terms, doc values and norms lookup to a single field of the wrapped
+   * LeafReader, whatever field is requested, so that all queries going through it target the same field.
+   */
+  static final class SingleFieldFilterLeafReader extends FilterLeafReader {
+    final String fieldName;
+    SingleFieldFilterLeafReader(LeafReader in, String fieldName) {
+      super(in);
+      this.fieldName = fieldName;
+    }
+
+    @Override
+    public FieldInfos getFieldInfos() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Fields fields() throws IOException {
+      return new FilterFields(super.fields()) {
+        @Override
+        public Terms terms(String field) throws IOException {
+          return super.terms(fieldName);
+        }
+
+        @Override
+        public Iterator<String> iterator() {
+          return Collections.singletonList(fieldName).iterator();
+        }
+
+        @Override
+        public int size() {
+          return 1;
+        }
+      };
+    }
+
+    @Override
+    public NumericDocValues getNumericDocValues(String field) throws IOException {
+      return super.getNumericDocValues(fieldName);
+    }
+
+    @Override
+    public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+      return super.getBinaryDocValues(fieldName);
+    }
+
+    @Override
+    public SortedDocValues getSortedDocValues(String field) throws IOException {
+      return super.getSortedDocValues(fieldName);
+    }
+
+    @Override
+    public NumericDocValues getNormValues(String field) throws IOException {
+      return super.getNormValues(fieldName);
+    }
+  }
+
+
  /**
   * A Spans based on a list of cached spans for one doc.  It is pre-positioned to this doc.
   */
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@ -24,6 +24,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.EnumSet;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@ -31,6 +32,7 @@ import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.function.Predicate;
 import java.util.function.Supplier;

 import org.apache.lucene.analysis.Analyzer;
@ -58,7 +60,6 @@ import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.InPlaceMergeSorter;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;

 /**
@ -119,13 +120,13 @@ public class UnifiedHighlighter {

  private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy

-  // private boolean defaultRequireFieldMatch = true; TODO
-
  private int maxLength = DEFAULT_MAX_LENGTH;

  // BreakIterator is stateful so we use a Supplier factory method
  private Supplier<BreakIterator> defaultBreakIterator = () -> BreakIterator.getSentenceInstance(Locale.ROOT);

+  private Predicate<String> defaultFieldMatcher;
+
  private PassageScorer defaultScorer = new PassageScorer();

  private PassageFormatter defaultFormatter = new DefaultPassageFormatter();
@ -140,8 +141,8 @@ public class UnifiedHighlighter {
  /**
   * Calls {@link Weight#extractTerms(Set)} on an empty index for the query.
   */
-  protected static SortedSet<Term> extractTerms(Query query) throws IOException {
-    SortedSet<Term> queryTerms = new TreeSet<>();
+  protected static Set<Term> extractTerms(Query query) throws IOException {
+    Set<Term> queryTerms = new HashSet<>();
    EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
    return queryTerms;
  }
@ -197,6 +198,10 @@ public class UnifiedHighlighter {
    this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
  }

+  public void setFieldMatcher(Predicate<String> predicate) {
+    this.defaultFieldMatcher = predicate;
+  }
+
  /**
   * Returns whether {@link MultiTermQuery} derivatives will be highlighted.  By default it's enabled.  MTQ
   * highlighting can be expensive, particularly when using offsets in postings.
@ -220,6 +225,18 @@ public class UnifiedHighlighter {
    return defaultPassageRelevancyOverSpeed;
  }

+  /**
+   * Returns the predicate to use for extracting the query part that must be highlighted.
+   * By default only queries that target the current field are kept. (AKA requireFieldMatch)
+   */
+  protected Predicate<String> getFieldMatcher(String field) {
+    if (defaultFieldMatcher != null) {
+      return defaultFieldMatcher;
+    } else {
+      // requireFieldMatch = true
+      return (qf) -> field.equals(qf);
+    }
+  }

  /**
   * The maximum content size to process.  Content will be truncated to this size before highlighting. Typically
@ -548,7 +565,7 @@ public class UnifiedHighlighter {
    copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params

    // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
-    SortedSet<Term> queryTerms = extractTerms(query);
+    Set<Term> queryTerms = extractTerms(query);
    FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
    int numTermVectors = 0;
    int numPostings = 0;
@ -718,13 +735,13 @@ public class UnifiedHighlighter {
          getClass().getSimpleName() + " without an IndexSearcher.");
    }
    Objects.requireNonNull(content, "content is required");
-    SortedSet<Term> queryTerms = extractTerms(query);
+    Set<Term> queryTerms = extractTerms(query);
    return getFieldHighlighter(field, query, queryTerms, maxPassages)
        .highlightFieldForDoc(null, -1, content);
  }

-  protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
-    BytesRef[] terms = filterExtractedTerms(field, allTerms);
+  protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
+    BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
@ -738,19 +755,15 @@ public class UnifiedHighlighter {
        getFormatter(field));
  }

-  protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
-    // TODO consider requireFieldMatch
-    Term floor = new Term(field, "");
-    Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
-    SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
-
-    // Strip off the redundant field:
-    BytesRef[] terms = new BytesRef[fieldTerms.size()];
-    int termUpto = 0;
-    for (Term term : fieldTerms) {
-      terms[termUpto++] = term.bytes();
+  protected static BytesRef[] filterExtractedTerms(Predicate<String> fieldMatcher, Set<Term> queryTerms) {
+    // Strip off the redundant field and sort the remaining terms
+    SortedSet<BytesRef> filteredTerms = new TreeSet<>();
+    for (Term term : queryTerms) {
+      if (fieldMatcher.test(term.field())) {
+        filteredTerms.add(term.bytes());
+      }
    }
-    return terms;
+    return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
  }

  protected Set<HighlightFlag> getFlags(String field) {
@ -771,14 +784,13 @@ public class UnifiedHighlighter {
    boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
    boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
    return highlightPhrasesStrictly ?
-        new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) :
-        PhraseHelper.NONE;
+        new PhraseHelper(query, field, getFieldMatcher(field),
+            this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE;
  }

  protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
    return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
-        ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
-          this::preMultiTermQueryRewrite)
+        ? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), !highlightFlags.contains(HighlightFlag.PHRASES), this::preMultiTermQueryRewrite)
        : ZERO_LEN_AUTOMATA_ARRAY;
  }

@ -826,7 +838,7 @@ public class UnifiedHighlighter {
          //skip using a memory index since it's pure term filtering
          return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
        } else {
-          return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
+          return new MemoryIndexOffsetStrategy(field, getFieldMatcher(field), terms, phraseHelper, automata, getIndexAnalyzer(),
              this::preMultiTermQueryRewrite);
        }
      case NONE_NEEDED:
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@ -25,6 +25,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.function.Predicate;

 import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
 import org.apache.lucene.analysis.MockAnalyzer;
@ -32,14 +33,17 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
@ -959,4 +963,275 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
    ir.close();
  }

+  /**
+   * Indexes a single document carrying the three fields "title", "text" and "category" (all created
+   * with this test's {@code fieldType}) and hands back a reader over the result.
+   *
+   * @return a reader obtained from the writer just before the writer is closed
+   * @throws IOException if adding the document, opening the reader or closing the writer fails
+   */
+  private IndexReader indexSomeFields() throws IOException {
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, indexAnalyzer);
+
+    // NOTE(review): this stored-only type is configured and frozen but never handed to any Field
+    // below (they all use fieldType) -- confirm whether it can be dropped together with its import.
+    FieldType ft = new FieldType();
+    ft.setIndexOptions(IndexOptions.NONE);
+    ft.setTokenized(false);
+    ft.setStored(true);
+    ft.freeze();
+
+    // Fields are added in the order title, text, category.
+    Document doc = new Document();
+    doc.add(new Field("title", "This is the title field.", fieldType));
+    doc.add(new Field("text", "This is the text field. You can put some text if you want.", fieldType));
+    doc.add(new Field("category", "This is the category field.", fieldType));
+    writer.addDocument(doc);
+
+    // Grab the reader first, then close the writer.
+    IndexReader reader = writer.getReader();
+    writer.close();
+    return reader;
+  }
+
+  /**
+   * Checks how the field matcher influences highlighting of plain {@code TermQuery} clauses that
+   * target the "title", "text" and "category" fields of the document built by
+   * {@code indexSomeFields()}.
+   *
+   * <p>For each highlighted field three configurations are asserted: a highlighter whose matcher
+   * accepts every query field, a highlighter left on its default matcher, and that same highlighter
+   * with a custom matcher installed through {@code setFieldMatcher}.
+   */
+  public void testFieldMatcherTermQuery() throws Exception {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
+      @Override
+      protected Predicate<String> getFieldMatcher(String field) {
+        // requireFieldMatch=false
+        return (qf) -> true;
+      }
+    };
+    // No override here: this instance starts out with the default field matcher.
+    UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
+    // Query terms per field -- text: some, field, this; title: is, this; category: this, some, category.
+    BooleanQuery.Builder queryBuilder =
+        new BooleanQuery.Builder()
+            .add(new TermQuery(new Term("text", "some")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "field")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "this")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "this")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("category", "this")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("category", "some")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("category", "category")), BooleanClause.Occur.SHOULD);
+    Query query = queryBuilder.build();
+
+    // title
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: "field", which is only queried on text, is marked as well.
+      String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the title <b>field</b>.", snippets[0]);
+
+      // Default matcher: only the terms of the title clauses (is, this) are marked.
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
+
+      // Matcher restricted to the text clauses: their terms (this, field) are marked in the title.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the title <b>field</b>.", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // text
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: terms of title and category clauses are marked too.
+      String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
+
+      // Default matcher: only the terms of the text clauses (some, field, this) are marked.
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
+
+      // Matcher restricted to the title clauses (is, this). The expected snippet stops after the
+      // first sentence, trailing space included.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the text field. ", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // category
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: every query term present in the category value is marked.
+      String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.", snippets[0]);
+
+      // Default matcher: only the terms of the category clauses that occur here (this, category).
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the <b>category</b> field.", snippets[0]);
+
+
+      // Matcher restricted to the title clauses (is, this).
+      highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+    ir.close();
+  }
+
+  /**
+   * Checks how the field matcher influences highlighting when the query is built mostly from
+   * multi-term clauses ({@code FuzzyQuery} and {@code PrefixQuery}, plus one {@code TermQuery}) on
+   * the "title", "text" and "category" fields of the document built by {@code indexSomeFields()}.
+   *
+   * <p>For each highlighted field three configurations are asserted: a highlighter whose matcher
+   * accepts every query field, a highlighter left on its default matcher, and that same highlighter
+   * with a custom matcher installed through {@code setFieldMatcher}.
+   */
+  public void testFieldMatcherMultiTermQuery() throws Exception {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
+      @Override
+      protected Predicate<String> getFieldMatcher(String field) {
+        // requireFieldMatch=false
+        return (qf) -> true;
+      }
+    };
+    // No override here: this instance starts out with the default field matcher.
+    UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
+    // Per the expected snippets below, fuzzy "sime" marks "some" and the prefixes "fie", "thi",
+    // "categ" mark "field", "This", "category".
+    BooleanQuery.Builder queryBuilder =
+        new BooleanQuery.Builder()
+            .add(new FuzzyQuery(new Term("text", "sime"), 1), BooleanClause.Occur.SHOULD)
+            .add(new PrefixQuery(new Term("text", "fie")), BooleanClause.Occur.SHOULD)
+            .add(new PrefixQuery(new Term("text", "thi")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD)
+            .add(new PrefixQuery(new Term("title", "thi")), BooleanClause.Occur.SHOULD)
+            .add(new PrefixQuery(new Term("category", "thi")), BooleanClause.Occur.SHOULD)
+            .add(new FuzzyQuery(new Term("category", "sime"), 1), BooleanClause.Occur.SHOULD)
+            .add(new PrefixQuery(new Term("category", "categ")), BooleanClause.Occur.SHOULD);
+    Query query = queryBuilder.build();
+
+    // title
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: "field", only reachable through the text prefix clause, is marked.
+      String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the title <b>field</b>.", snippets[0]);
+
+      // Default matcher: only the title clauses (term "is", prefix "thi") contribute.
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
+
+      // Matcher restricted to the text clauses (fuzzy "sime", prefixes "fie" and "thi").
+      highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the title <b>field</b>.", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // text
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: "is", only queried on title, is marked as well.
+      String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
+
+      // Default matcher: only the text clauses contribute.
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
+
+      // Matcher restricted to the title clauses. The expected snippet stops after the first
+      // sentence, trailing space included.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the text field. ", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // category
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: clauses of every field contribute.
+      String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.", snippets[0]);
+
+      // Default matcher: only the category clauses contribute.
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> is the <b>category</b> field.", snippets[0]);
+
+
+      // Matcher restricted to the title clauses (term "is", prefix "thi").
+      highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+    ir.close();
+  }
+
+  /**
+   * Checks how the field matcher influences highlighting of {@code PhraseQuery} clauses (exact and
+   * sloppy) on the "title", "text" and "category" fields of the document built by
+   * {@code indexSomeFields()}.
+   *
+   * <p>For each highlighted field three configurations are asserted: a highlighter whose matcher
+   * accepts every query field, a highlighter left on its default matcher, and that same highlighter
+   * with a custom matcher installed through {@code setFieldMatcher}.
+   */
+  public void testFieldMatcherPhraseQuery() throws Exception {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
+      @Override
+      protected Predicate<String> getFieldMatcher(String field) {
+        // requireFieldMatch=false
+        return (qf) -> true;
+      }
+    };
+    // No override here: this instance starts out with the default field matcher.
+    UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
+    // One phrase on title, two on category (one built with slop 2) and two on text (one with slop 1).
+    BooleanQuery.Builder queryBuilder =
+        new BooleanQuery.Builder()
+            .add(new PhraseQuery("title", "this", "is", "the", "title"), BooleanClause.Occur.SHOULD)
+            .add(new PhraseQuery(2, "category", "this", "is", "the", "field"), BooleanClause.Occur.SHOULD)
+            .add(new PhraseQuery("text", "this", "is"), BooleanClause.Occur.SHOULD)
+            .add(new PhraseQuery("category", "this", "is"), BooleanClause.Occur.SHOULD)
+            .add(new PhraseQuery(1, "text", "you", "can", "put", "text"), BooleanClause.Occur.SHOULD);
+    Query query = queryBuilder.build();
+
+    // title
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: phrases written for other fields are applied to the title too.
+      String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> <b>field</b>.", snippets[0]);
+
+      // Default matcher: only the title phrase "this is the title" is marked.
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> field.", snippets[0]);
+
+      // Matcher restricted to the text clauses: of those, only "this is" shows up in the title.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
+      snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // text
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields: phrases written for other fields are applied to the text too.
+      String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>text</b> <b>field</b>. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", snippets[0]);
+
+      // Default matcher: only the two text phrases contribute.
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the <b>text</b> field. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", snippets[0]);
+
+      // Matcher restricted to the title clause: nothing is marked and the whole value is expected back.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
+      snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("This is the text field. You can put some text if you want.", snippets[0]);
+      // Clear the custom matcher; the default-matcher assertions of the next section rely on this.
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+
+    // category
+    {
+      TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+      assertEquals(1, topDocs.totalHits);
+      // Matcher accepts all fields; the expected markup is the same as with the default matcher below.
+      String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", snippets[0]);
+
+      // Default matcher: the two category phrases contribute.
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", snippets[0]);
+
+
+      // Matcher restricted to the text clauses: of those, only "this is" shows up in the category value.
+      highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
+      snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
+      assertEquals(1, snippets.length);
+      assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
+      highlighterFieldMatch.setFieldMatcher(null);
+    }
+    ir.close();
+  }
 }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
@ -23,7 +23,6 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.SortedSet;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
@ -144,7 +143,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
      }

      @Override
-      protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
+      protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
+        // Pure pass-through to the superclass implementation; no behavior is added here.
+        // NOTE(review): presumably kept so this test fails to compile if the protected signature
+        // stops being overridable from outside the package -- confirm.
        return super.getFieldHighlighter(field, query, allTerms, maxPassages);
      }