From bed694ec8811c67b8ba4b4c8943e60eda281850a Mon Sep 17 00:00:00 2001
From: Alan Woodward <romseygeek@apache.org>
Date: Wed, 27 Nov 2019 16:28:19 +0000
Subject: [PATCH] LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037)

This commit adds a consumeTermsMatching() method to QueryVisitor, allowing
queries that match against a class of terms to report that class back to the visitor.
It also changes the highlighting code to use this new method, replacing the previous
implementation, which relied on instanceof checks.
---
 lucene/CHANGES.txt                            |  4 +
 .../apache/lucene/search/AutomatonQuery.java  |  4 +-
 .../org/apache/lucene/search/FuzzyQuery.java  | 13 ++-
 .../apache/lucene/search/QueryVisitor.java    | 15 ++-
 .../util/automaton/ByteRunAutomaton.java      |  1 -
 .../util/automaton/CompiledAutomaton.java     | 26 +++++-
 .../search/uhighlight/CharArrayMatcher.java   | 51 +++++++++++
 .../uhighlight/FieldOffsetStrategy.java       | 13 ++-
 .../uhighlight/LabelledCharArrayMatcher.java  | 88 ++++++++++++++++++
 .../uhighlight/MemoryIndexOffsetStrategy.java | 32 +++----
 .../uhighlight/MultiTermHighlighting.java     | 91 +------------------
 .../search/uhighlight/NoOpOffsetStrategy.java |  4 +-
 .../uhighlight/TokenStreamOffsetStrategy.java | 24 ++---
 .../search/uhighlight/UHComponents.java       |  7 +-
 .../search/uhighlight/UnifiedHighlighter.java |  7 +-
 .../TestUnifiedHighlighterExtensibility.java  |  6 +-
 .../intervals/MultiTermIntervalsSource.java   |  2 +-
 17 files changed, 240 insertions(+), 148 deletions(-)
 create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java
 create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1c346928dd8..de611432991 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -92,6 +92,10 @@ Improvements
 
 * LUCENE-9036: ExitableDirectoryReader may interupt scaning over DocValues (Mikhail Khludnev)
 
+* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing queries
+  that match a class of terms to pass a ByteRunAutomaton matching that class
+  back to the visitor. (Alan Woodward, David Smiley)
+
 Optimizations
 
 * LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits
diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
index ed71c4a0dbe..08b9eeff6fc 100644
--- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
 
   @Override
   public void visit(QueryVisitor visitor) {
-    if (visitor.acceptField(getField())) {
-      visitor.visitLeaf(this);
+    if (visitor.acceptField(field)) {
+      compiled.visit(visitor, this, field);
     }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
index f136f7e9583..279f9e777b6 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
@@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.Operations;
 
 /** Implements the fuzzy search query. The similarity measurement
  * is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
@@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery {
 
   @Override
   public void visit(QueryVisitor visitor) {
-    // TODO find some way of consuming Automata
-    if (visitor.acceptField(term.field())) {
-      visitor.visitLeaf(this);
+    if (visitor.acceptField(field)) {
+      if (maxEdits == 0 || prefixLength >= term.text().length()) {
+        visitor.consumeTerms(this, term);
+      } else {
+        // Note: we're rebuilding the automaton here, so this can be expensive
+        visitor.consumeTermsMatching(this, field,
+            new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
+      }
     }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
index 5635f7d5d20..15a0eaffffe 100644
--- a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
+++ b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
@@ -21,6 +21,7 @@ import java.util.Arrays;
 import java.util.Set;
 
 import org.apache.lucene.index.Term;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
 
 /**
  * Allows recursion through a query tree
@@ -37,8 +38,18 @@ public abstract class QueryVisitor {
    */
   public void consumeTerms(Query query, Term... terms) { }
 
-  // TODO it would be nice to have a way to consume 'classes' of Terms from
-  // things like AutomatonQuery
+  /**
+   * Called by leaf queries that match on a class of terms
+   *
+   * @param query     the leaf query
+   * @param field     the field queried against
+   * @param automaton an automaton defining which terms match
+   *
+   * @lucene.experimental
+   */
+  public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
+    visitLeaf(query); // default impl for backward compatibility
+  }
 
   /**
    * Called by leaf queries that do not match on terms
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
index abd5109e655..abe7560f431 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.util.automaton;
 
-
 /**
  * Automaton representation for matching UTF-8 byte[].
  */
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index 55800dbb60f..1c9a2354b03 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -16,14 +16,17 @@
  */
 package org.apache.lucene.util.automaton;
 
-  
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.lucene.index.SingleTermsEnum;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
@@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable {
     }
   }
 
+  /**
+   * Report back to a QueryVisitor how this automaton matches terms
+   */
+  public void visit(QueryVisitor visitor, Query parent, String field) {
+    if (visitor.acceptField(field)) {
+      switch (type) {
+        case NORMAL:
+          visitor.consumeTermsMatching(parent, field, runAutomaton);
+          break;
+        case NONE:
+          break;
+        case ALL:
+          visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
+          break;
+        case SINGLE:
+          visitor.consumeTerms(parent, new Term(field, term));
+          break;
+      }
+    }
+  }
+
   /** Finds largest term accepted by this Automaton, that's
    *  &lt;= the provided input term.  The result is placed in
    *  output; it's fine for output and input to point to
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java
new file mode 100644
index 00000000000..75d5606e049
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CharArrayMatcher.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.uhighlight;
+
+import java.util.List;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Matches a character array
+ *
+ * @lucene.internal
+ */
+public interface CharArrayMatcher {
+
+  /**
+   * Return {@code true} if the passed-in character array matches
+   */
+  boolean match(char[] s, int offset, int length);
+
+  /**
+   * Return {@code true} if the passed-in CharsRef matches
+   */
+  default boolean match(CharsRef chars) {
+    return match(chars.chars, chars.offset, chars.length);
+  }
+
+  static CharArrayMatcher fromTerms(List<BytesRef> terms) {
+    CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
+    return a::run;
+  }
+
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
index c63896a428f..d7c936f41cc 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
@@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text.  Needs
@@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy {
   }
 
   protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
-    final CharacterRunAutomaton[] automata = components.getAutomata();
+    final LabelledCharArrayMatcher[] automata = components.getAutomata();
     List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
     for (int i = 0; i < automata.length; i++) {
       automataPostings.add(new ArrayList<>());
@@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy {
     CharsRefBuilder refBuilder = new CharsRefBuilder();
     while ((term = termsEnum.next()) != null) {
       for (int i = 0; i < automata.length; i++) {
-        CharacterRunAutomaton automaton = automata[i];
+        CharArrayMatcher automaton = automata[i];
         refBuilder.copyUTF8Bytes(term);
-        if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
+        if (automaton.match(refBuilder.get())) {
           PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
           if (doc == postings.advance(doc)) {
             automataPostings.get(i).add(postings);
@@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy {
     }
 
     for (int i = 0; i < automata.length; i++) {
-      CharacterRunAutomaton automaton = automata[i];
+      LabelledCharArrayMatcher automaton = automata[i];
       List<PostingsEnum> postingsEnums = automataPostings.get(i);
       if (postingsEnums.isEmpty()) {
         continue;
       }
-      // Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq
-      BytesRef wildcardTerm = new BytesRef(automaton.toString());
+      // Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq
+      BytesRef wildcardTerm = new BytesRef(automaton.getLabel());
       int sumFreq = 0;
       for (PostingsEnum postingsEnum : postingsEnums) {
         sumFreq += postingsEnum.freq();
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java
new file mode 100644
index 00000000000..c2a50aee9d0
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LabelledCharArrayMatcher.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.uhighlight;
+
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+
+/**
+ * Associates a label with a CharArrayMatcher to distinguish different sources for terms in highlighting
+ *
+ * @lucene.internal
+ */
+public interface LabelledCharArrayMatcher extends CharArrayMatcher {
+
+  /**
+   * @return the label for this matcher
+   */
+  String getLabel();
+
+  /**
+   * Associates a label with a CharArrayMatcher
+   */
+  static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) {
+    return new LabelledCharArrayMatcher() {
+      @Override
+      public String getLabel() {
+        return label;
+      }
+
+      @Override
+      public boolean match(char[] s, int offset, int length) {
+        return in.match(s, offset, length);
+      }
+    };
+  }
+
+  /**
+   * Returns a representation of the automaton that matches char[] instead of byte[]
+   */
+  static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
+      return wrap(label, (chars, offset, length) -> {
+        int state = 0;
+        final int maxIdx = offset + length;
+        for (int i = offset; i < maxIdx; i++) {
+          final int code = chars[i];
+          int b;
+          // UTF16 to UTF8   (inlined logic from UnicodeUtil.UTF16toUTF8 )
+          if (code < 0x80) {
+            state = runAutomaton.step(state, code);
+            if (state == -1) return false;
+          } else if (code < 0x800) {
+            b = (0xC0 | (code >> 6));
+            state = runAutomaton.step(state, b);
+            if (state == -1) return false;
+            b = (0x80 | (code & 0x3F));
+            state = runAutomaton.step(state, b);
+            if (state == -1) return false;
+          } else {
+            // more complex
+            byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
+            int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
+            for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
+              state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
+              if (state == -1) return false;
+            }
+            break;
+          }
+        }
+        return runAutomaton.isAccept(state);
+      });
+  }
+
+}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
index e53d6e48e77..e67cef3a789 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 
 /**
@@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
 
   private final MemoryIndex memoryIndex;
   private final LeafReader memIndexLeafReader;
-  private final CharacterRunAutomaton preMemIndexFilterAutomaton;
+  private final CharArrayMatcher preMemIndexFilterAutomaton;
 
   public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
     super(components, analyzer);
@@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
   }
 
   /**
-   * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+   * Build one {@link CharArrayMatcher} matching any term the query might match.
    */
-  private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) {
+  private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
     // We don't know enough about the query to do this confidently
     if (components.getTerms() == null || components.getAutomata() == null) {
       return null;
     }
 
-    List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
+    List<CharArrayMatcher> allAutomata = new ArrayList<>();
     if (components.getTerms().length > 0) {
-      allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms()))));
+      allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
     }
     Collections.addAll(allAutomata, components.getAutomata());
     for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
@@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
     if (allAutomata.size() == 1) {
       return allAutomata.get(0);
     }
+
     //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
     //  could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
 
-    // Return an aggregate CharacterRunAutomaton of others
-    return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
-      @Override
-      public boolean run(char[] chars, int offset, int length) {
-        for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
-          if (allAutomata.get(i).run(chars, offset, length)) {
-            return true;
-          }
+    // Return an aggregate CharArrayMatcher of others
+    return (chars, offset, length) -> {
+      for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
+        if (allAutomata.get(i).match(chars, offset, length)) {
+          return true;
         }
-        return false;
       }
+      return false;
     };
   }
 
@@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
   }
 
   private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
-                                                        final CharacterRunAutomaton charRunAutomaton) {
+                                                        final CharArrayMatcher matcher) {
     // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
     return new FilteringTokenFilter(tokenStream) {
       final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
 
       @Override
       protected boolean accept() throws IOException {
-        return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
+        return matcher.match(charAtt.buffer(), 0, charAtt.length());
       }
     };
   }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
index 8181c2613ee..ba8e85e9900 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.Operations;
 
 /**
  * Support for highlighting multi-term queries.
@@ -46,11 +41,10 @@ final class MultiTermHighlighting {
    * Extracts MultiTermQueries that match the provided field predicate.
    * Returns equivalent automata that will match terms.
    */
-  static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
-
+  static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
     AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher);
     query.visit(collector);
-    return collector.runAutomata.toArray(new CharacterRunAutomaton[0]);
+    return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]);
   }
 
   /**
@@ -63,7 +57,7 @@ final class MultiTermHighlighting {
 
   private static class AutomataCollector extends QueryVisitor {
 
-    List<CharacterRunAutomaton> runAutomata = new ArrayList<>();
+    List<LabelledCharArrayMatcher> runAutomata = new ArrayList<>();
     final boolean lookInSpan;
     final Predicate<String> fieldMatcher;
 
@@ -86,85 +80,10 @@ final class MultiTermHighlighting {
     }
 
     @Override
-    public void visitLeaf(Query query) {
-      if (query instanceof AutomatonQuery) {
-        AutomatonQuery aq = (AutomatonQuery) query;
-        if (aq.isAutomatonBinary() == false) {
-          // WildcardQuery, RegexpQuery
-          runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) {
-            @Override
-            public String toString() {
-              return query.toString();
-            }
-          });
-        }
-        else {
-          runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString()));
-        }
-      }
-      else if (query instanceof FuzzyQuery) {
-        FuzzyQuery fq = (FuzzyQuery) query;
-        if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) {
-          consumeTerms(query, fq.getTerm());
-        }
-        else {
-          runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){
-            @Override
-            public String toString() {
-              return query.toString();
-            }
-          });
-        }
-      }
+    public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
+      runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
     }
 
   }
 
-  private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) {
-    return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
-      //   TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
-      ByteRunAutomaton byteRunAutomaton =
-          new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
-
-      @Override
-      public String toString() {
-        return description;
-      }
-
-      @Override
-      public boolean run(char[] chars, int offset, int length) {
-        int state = 0;
-        final int maxIdx = offset + length;
-        for (int i = offset; i < maxIdx; i++) {
-          final int code = chars[i];
-          int b;
-          // UTF16 to UTF8   (inlined logic from UnicodeUtil.UTF16toUTF8 )
-          if (code < 0x80) {
-            state = byteRunAutomaton.step(state, code);
-            if (state == -1) return false;
-          } else if (code < 0x800) {
-            b = (0xC0 | (code >> 6));
-            state = byteRunAutomaton.step(state, b);
-            if (state == -1) return false;
-            b = (0x80 | (code & 0x3F));
-            state = byteRunAutomaton.step(state, b);
-            if (state == -1) return false;
-          } else {
-            // more complex
-            byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
-            int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
-            for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
-              state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
-              if (state == -1) return false;
-            }
-            break;
-          }
-        }
-        return byteRunAutomaton.isAccept(state);
-      }
-    };
-  }
-
-
-
 }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java
index 08f2b128892..d69d1cb33c3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java
@@ -22,7 +22,6 @@ import java.util.Collections;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * Never returns offsets. Used when the query would highlight nothing.
@@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
   public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();
 
   private NoOpOffsetStrategy() {
-    super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet()));
+    super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(),
+        new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet()));
   }
 
   @Override
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
index a7282b6b5af..c8729140296 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
  */
 public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 
-  private final CharacterRunAutomaton[] combinedAutomata;
+  private final CharArrayMatcher[] combinedAutomata;
 
   public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
     super(components, indexAnalyzer);
     assert components.getPhraseHelper().hasPositionSensitivity() == false;
-    combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata());
+    combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata());
   }
 
   //TODO this is inefficient; instead build a union automata just for terms part.
-  private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
-    CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
+  private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
+    CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
     for (int i = 0; i < terms.length; i++) {
       String termString = terms[i].utf8ToString();
-      newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
-        @Override
-        public String toString() {
-          return termString;
-        }
-      };
+      CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
+      newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
     }
     // Append existing automata (that which is used for MTQs)
-    System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
+    System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
     return newAutomata;
   }
 
@@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 
   private static class TokenStreamOffsetsEnum extends OffsetsEnum {
     TokenStream stream; // becomes null when closed
-    final CharacterRunAutomaton[] matchers;
+    final CharArrayMatcher[] matchers;
     final CharTermAttribute charTermAtt;
     final OffsetAttribute offsetAtt;
 
@@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 
     final BytesRef matchDescriptions[];
 
-    TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
+    TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
       this.stream = ts;
       this.matchers = matchers;
       matchDescriptions = new BytesRef[matchers.length];
@@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
       if (stream != null) {
         while (stream.incrementToken()) {
           for (int i = 0; i < matchers.length; i++) {
-            if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+            if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) {
               currentMatch = i;
               return true;
             }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java
index 4af6d7098c4..65dd84b5fa3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java
@@ -22,7 +22,6 @@ import java.util.function.Predicate;
 
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * A parameter object to hold the components a {@link FieldOffsetStrategy} needs.
@@ -35,12 +34,12 @@ public class UHComponents {
   private final Query query;
   private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
   private final PhraseHelper phraseHelper; // Query: position-sensitive information
-  private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
+  private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
   private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
   private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;
 
   public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
-                      BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
+                      BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata,
                       boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
     this.field = field;
     this.fieldMatcher = fieldMatcher;
@@ -72,7 +71,7 @@ public class UHComponents {
     return phraseHelper;
   }
 
-  public CharacterRunAutomaton[] getAutomata() {
+  public LabelledCharArrayMatcher[] getAutomata() {
     return automata;
   }
 
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
index e6c0742d3d1..74de2483b8a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -62,7 +62,6 @@ import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.InPlaceMergeSorter;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * A Highlighter that can get offsets from either
@@ -110,7 +109,7 @@ public class UnifiedHighlighter {
     }
   }
 
-  protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0];
+  protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0];
 
   protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
 
@@ -769,7 +768,7 @@ public class UnifiedHighlighter {
     PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
     boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
     BytesRef[] terms = null;
-    CharacterRunAutomaton[] automata = null;
+    LabelledCharArrayMatcher[] automata = null;
     if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
       terms = filterExtractedTerms(fieldMatcher, allTerms);
       automata = getAutomata(field, query, highlightFlags);
@@ -839,7 +838,7 @@ public class UnifiedHighlighter {
         : PhraseHelper.NONE;
   }
 
-  protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
+  protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
     // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those?
     // if don't highlight phrases strictly,
     final boolean lookInSpan =
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
index def44319d63..33bc7e1d2a0 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
@@ -36,6 +36,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.uhighlight.FieldHighlighter;
 import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
+import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
 import org.apache.lucene.search.uhighlight.OffsetsEnum;
 import org.apache.lucene.search.uhighlight.Passage;
 import org.apache.lucene.search.uhighlight.PassageFormatter;
@@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.junit.Test;
 
 /**
@@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
         (s) -> false,
         new MatchAllDocsQuery(), new BytesRef[0],
         PhraseHelper.NONE,
-        new CharacterRunAutomaton[0], false, Collections.emptySet())) {
+        new LabelledCharArrayMatcher[0], false, Collections.emptySet())) {
       @Override
       public UnifiedHighlighter.OffsetSource getOffsetSource() {
         return offsetSource;
@@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
         BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
         Set<HighlightFlag> highlightFlags = getFlags(field);
         PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
-        CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+        LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
         boolean queryHasUnrecognizedPart = false;
         return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
       }
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java
index 5fb6389d311..589f9c6e3fc 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java
@@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource {
 
   @Override
   public void visit(String field, QueryVisitor visitor) {
-    visitor.visitLeaf(new IntervalQuery(field, this));
+    automaton.visit(visitor, new IntervalQuery(field, this), field);
   }
 
   @Override