mirror of https://github.com/apache/lucene.git
LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037)
This commit adds a consumeTermsMatching() method to QueryVisitor, allowing queries that match against a class of terms to report this back to the visitor. It also changes highlighting code to use this new method, replacing the current implementation via instanceof checks.
This commit is contained in:
parent
47a908a0b9
commit
bed694ec88
|
@ -92,6 +92,10 @@ Improvements
|
|||
|
||||
* LUCENE-9036: ExitableDirectoryReader may interrupt scanning over DocValues (Mikhail Khludnev)
|
||||
|
||||
* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing queries
|
||||
that match a class of terms to pass a ByteRunAutomaton matching those terms
|
||||
back to the visitor. (Alan Woodward, David Smiley)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits
|
||||
|
|
|
@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
|
|||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
if (visitor.acceptField(getField())) {
|
||||
visitor.visitLeaf(this);
|
||||
if (visitor.acceptField(field)) {
|
||||
compiled.visit(visitor, this, field);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms;
|
|||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/** Implements the fuzzy search query. The similarity measurement
|
||||
* is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
|
||||
|
@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
// TODO find some way of consuming Automata
|
||||
if (visitor.acceptField(term.field())) {
|
||||
visitor.visitLeaf(this);
|
||||
if (visitor.acceptField(field)) {
|
||||
if (maxEdits == 0 || prefixLength >= term.text().length()) {
|
||||
visitor.consumeTerms(this, term);
|
||||
} else {
|
||||
// Note: we're rebuilding the automaton here, so this can be expensive
|
||||
visitor.consumeTermsMatching(this, field,
|
||||
new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.Arrays;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
/**
|
||||
* Allows recursion through a query tree
|
||||
|
@ -37,8 +38,18 @@ public abstract class QueryVisitor {
|
|||
*/
|
||||
public void consumeTerms(Query query, Term... terms) { }
|
||||
|
||||
// TODO it would be nice to have a way to consume 'classes' of Terms from
|
||||
// things like AutomatonQuery
|
||||
/**
|
||||
* Called by leaf queries that match on a class of terms
|
||||
*
|
||||
* @param query the leaf query
|
||||
* @param field the field queried against
|
||||
* @param automaton an automaton defining which terms match
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
visitLeaf(query); // default impl for backward compatibility
|
||||
}
|
||||
|
||||
/**
|
||||
* Called by leaf queries that do not match on terms
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
|
||||
/**
|
||||
* Automaton representation for matching UTF-8 byte[].
|
||||
*/
|
||||
|
|
|
@ -22,8 +22,11 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.SingleTermsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryVisitor;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
|
@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Report back to a QueryVisitor how this automaton matches terms
|
||||
*/
|
||||
public void visit(QueryVisitor visitor, Query parent, String field) {
|
||||
if (visitor.acceptField(field)) {
|
||||
switch (type) {
|
||||
case NORMAL:
|
||||
visitor.consumeTermsMatching(parent, field, runAutomaton);
|
||||
break;
|
||||
case NONE:
|
||||
break;
|
||||
case ALL:
|
||||
visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
|
||||
break;
|
||||
case SINGLE:
|
||||
visitor.consumeTerms(parent, new Term(field, term));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Finds largest term accepted by this Automaton, that's
|
||||
* <= the provided input term. The result is placed in
|
||||
* output; it's fine for output and input to point to
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Matches a character array
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public interface CharArrayMatcher {
|
||||
|
||||
/**
|
||||
* Return {@code true} if the passed-in character array matches
|
||||
*/
|
||||
boolean match(char[] s, int offset, int length);
|
||||
|
||||
/**
|
||||
* Return {@code true} if the passed-in CharsRef matches
|
||||
*/
|
||||
default boolean match(CharsRef chars) {
|
||||
return match(chars.chars, chars.offset, chars.length);
|
||||
}
|
||||
|
||||
static CharArrayMatcher fromTerms(List<BytesRef> terms) {
|
||||
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
|
||||
return a::run;
|
||||
}
|
||||
|
||||
}
|
|
@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator;
|
|||
import org.apache.lucene.search.ScoreMode;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs
|
||||
|
@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy {
|
|||
}
|
||||
|
||||
protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
|
||||
final CharacterRunAutomaton[] automata = components.getAutomata();
|
||||
final LabelledCharArrayMatcher[] automata = components.getAutomata();
|
||||
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
automataPostings.add(new ArrayList<>());
|
||||
|
@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy {
|
|||
CharsRefBuilder refBuilder = new CharsRefBuilder();
|
||||
while ((term = termsEnum.next()) != null) {
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
CharArrayMatcher automaton = automata[i];
|
||||
refBuilder.copyUTF8Bytes(term);
|
||||
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
|
||||
if (automaton.match(refBuilder.get())) {
|
||||
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
if (doc == postings.advance(doc)) {
|
||||
automataPostings.get(i).add(postings);
|
||||
|
@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy {
|
|||
}
|
||||
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
LabelledCharArrayMatcher automaton = automata[i];
|
||||
List<PostingsEnum> postingsEnums = automataPostings.get(i);
|
||||
if (postingsEnums.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq
|
||||
BytesRef wildcardTerm = new BytesRef(automaton.toString());
|
||||
// Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq
|
||||
BytesRef wildcardTerm = new BytesRef(automaton.getLabel());
|
||||
int sumFreq = 0;
|
||||
for (PostingsEnum postingsEnum : postingsEnums) {
|
||||
sumFreq += postingsEnum.freq();
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
/**
|
||||
* Associates a label with a CharArrayMatcher to distinguish different sources for terms in highlighting
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public interface LabelledCharArrayMatcher extends CharArrayMatcher {
|
||||
|
||||
/**
|
||||
* @return the label for this matcher
|
||||
*/
|
||||
String getLabel();
|
||||
|
||||
/**
|
||||
* Associates a label with a CharArrayMatcher
|
||||
*/
|
||||
static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) {
|
||||
return new LabelledCharArrayMatcher() {
|
||||
@Override
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(char[] s, int offset, int length) {
|
||||
return in.match(s, offset, length);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a representation of the automaton that matches char[] instead of byte[]
|
||||
*/
|
||||
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
|
||||
return wrap(label, (chars, offset, length) -> {
|
||||
int state = 0;
|
||||
final int maxIdx = offset + length;
|
||||
for (int i = offset; i < maxIdx; i++) {
|
||||
final int code = chars[i];
|
||||
int b;
|
||||
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
|
||||
if (code < 0x80) {
|
||||
state = runAutomaton.step(state, code);
|
||||
if (state == -1) return false;
|
||||
} else if (code < 0x800) {
|
||||
b = (0xC0 | (code >> 6));
|
||||
state = runAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
b = (0x80 | (code & 0x3F));
|
||||
state = runAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
} else {
|
||||
// more complex
|
||||
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
|
||||
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
|
||||
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
|
||||
state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
|
||||
if (state == -1) return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return runAutomaton.isAccept(state);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.memory.MemoryIndex;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
private final MemoryIndex memoryIndex;
|
||||
private final LeafReader memIndexLeafReader;
|
||||
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
|
||||
private final CharArrayMatcher preMemIndexFilterAutomaton;
|
||||
|
||||
public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
|
||||
super(components, analyzer);
|
||||
|
@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
}
|
||||
|
||||
/**
|
||||
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
|
||||
* Build one {@link CharArrayMatcher} matching any term the query might match.
|
||||
*/
|
||||
private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) {
|
||||
private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
|
||||
// We don't know enough about the query to do this confidently
|
||||
if (components.getTerms() == null || components.getAutomata() == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
|
||||
List<CharArrayMatcher> allAutomata = new ArrayList<>();
|
||||
if (components.getTerms().length > 0) {
|
||||
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms()))));
|
||||
allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
|
||||
}
|
||||
Collections.addAll(allAutomata, components.getAutomata());
|
||||
for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
|
||||
|
@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
if (allAutomata.size() == 1) {
|
||||
return allAutomata.get(0);
|
||||
}
|
||||
|
||||
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
|
||||
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
|
||||
|
||||
// Return an aggregate CharacterRunAutomaton of others
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
|
||||
if (allAutomata.get(i).run(chars, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
// Return an aggregate CharArrayMatcher of others
|
||||
return (chars, offset, length) -> {
|
||||
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
|
||||
if (allAutomata.get(i).match(chars, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
}
|
||||
|
||||
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
|
||||
final CharacterRunAutomaton charRunAutomaton) {
|
||||
final CharArrayMatcher matcher) {
|
||||
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
|
||||
return new FilteringTokenFilter(tokenStream) {
|
||||
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
@Override
|
||||
protected boolean accept() throws IOException {
|
||||
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
|
||||
return matcher.match(charAtt.buffer(), 0, charAtt.length());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryVisitor;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/**
|
||||
* Support for highlighting multi-term queries.
|
||||
|
@ -46,11 +41,10 @@ final class MultiTermHighlighting {
|
|||
* Extracts MultiTermQueries that match the provided field predicate.
|
||||
* Returns equivalent automata that will match terms.
|
||||
*/
|
||||
static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
|
||||
|
||||
static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
|
||||
AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher);
|
||||
query.visit(collector);
|
||||
return collector.runAutomata.toArray(new CharacterRunAutomaton[0]);
|
||||
return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -63,7 +57,7 @@ final class MultiTermHighlighting {
|
|||
|
||||
private static class AutomataCollector extends QueryVisitor {
|
||||
|
||||
List<CharacterRunAutomaton> runAutomata = new ArrayList<>();
|
||||
List<LabelledCharArrayMatcher> runAutomata = new ArrayList<>();
|
||||
final boolean lookInSpan;
|
||||
final Predicate<String> fieldMatcher;
|
||||
|
||||
|
@ -86,85 +80,10 @@ final class MultiTermHighlighting {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void visitLeaf(Query query) {
|
||||
if (query instanceof AutomatonQuery) {
|
||||
AutomatonQuery aq = (AutomatonQuery) query;
|
||||
if (aq.isAutomatonBinary() == false) {
|
||||
// WildcardQuery, RegexpQuery
|
||||
runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return query.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
else {
|
||||
runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString()));
|
||||
}
|
||||
}
|
||||
else if (query instanceof FuzzyQuery) {
|
||||
FuzzyQuery fq = (FuzzyQuery) query;
|
||||
if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) {
|
||||
consumeTerms(query, fq.getTerm());
|
||||
}
|
||||
else {
|
||||
runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){
|
||||
@Override
|
||||
public String toString() {
|
||||
return query.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) {
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
|
||||
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
|
||||
ByteRunAutomaton byteRunAutomaton =
|
||||
new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
int state = 0;
|
||||
final int maxIdx = offset + length;
|
||||
for (int i = offset; i < maxIdx; i++) {
|
||||
final int code = chars[i];
|
||||
int b;
|
||||
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
|
||||
if (code < 0x80) {
|
||||
state = byteRunAutomaton.step(state, code);
|
||||
if (state == -1) return false;
|
||||
} else if (code < 0x800) {
|
||||
b = (0xC0 | (code >> 6));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
b = (0x80 | (code & 0x3F));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
} else {
|
||||
// more complex
|
||||
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
|
||||
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
|
||||
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
|
||||
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
|
||||
if (state == -1) return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return byteRunAutomaton.isAccept(state);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.Collections;
|
|||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Never returns offsets. Used when the query would highlight nothing.
|
||||
|
@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
|
|||
public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();
|
||||
|
||||
private NoOpOffsetStrategy() {
|
||||
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet()));
|
||||
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(),
|
||||
new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet()));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|||
*/
|
||||
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
||||
|
||||
private final CharacterRunAutomaton[] combinedAutomata;
|
||||
private final CharArrayMatcher[] combinedAutomata;
|
||||
|
||||
public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
|
||||
super(components, indexAnalyzer);
|
||||
assert components.getPhraseHelper().hasPositionSensitivity() == false;
|
||||
combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata());
|
||||
combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata());
|
||||
}
|
||||
|
||||
//TODO this is inefficient; instead build a union automata just for terms part.
|
||||
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
|
||||
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
|
||||
private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
|
||||
CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
String termString = terms[i].utf8ToString();
|
||||
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return termString;
|
||||
}
|
||||
};
|
||||
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
|
||||
newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
|
||||
}
|
||||
// Append existing automata (that which is used for MTQs)
|
||||
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
|
||||
System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
|
||||
return newAutomata;
|
||||
}
|
||||
|
||||
|
@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
private static class TokenStreamOffsetsEnum extends OffsetsEnum {
|
||||
TokenStream stream; // becomes null when closed
|
||||
final CharacterRunAutomaton[] matchers;
|
||||
final CharArrayMatcher[] matchers;
|
||||
final CharTermAttribute charTermAtt;
|
||||
final OffsetAttribute offsetAtt;
|
||||
|
||||
|
@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
final BytesRef matchDescriptions[];
|
||||
|
||||
TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
|
||||
TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
|
||||
this.stream = ts;
|
||||
this.matchers = matchers;
|
||||
matchDescriptions = new BytesRef[matchers.length];
|
||||
|
@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
if (stream != null) {
|
||||
while (stream.incrementToken()) {
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
currentMatch = i;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.function.Predicate;
|
|||
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* A parameter object to hold the components a {@link FieldOffsetStrategy} needs.
|
||||
|
@ -35,12 +34,12 @@ public class UHComponents {
|
|||
private final Query query;
|
||||
private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
|
||||
private final PhraseHelper phraseHelper; // Query: position-sensitive information
|
||||
private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
|
||||
private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
|
||||
private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
|
||||
private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;
|
||||
|
||||
public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
|
||||
BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
|
||||
BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata,
|
||||
boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
|
||||
this.field = field;
|
||||
this.fieldMatcher = fieldMatcher;
|
||||
|
@ -72,7 +71,7 @@ public class UHComponents {
|
|||
return phraseHelper;
|
||||
}
|
||||
|
||||
public CharacterRunAutomaton[] getAutomata() {
|
||||
public LabelledCharArrayMatcher[] getAutomata() {
|
||||
return automata;
|
||||
}
|
||||
|
||||
|
|
|
@ -62,7 +62,6 @@ import org.apache.lucene.search.Weight;
|
|||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.InPlaceMergeSorter;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* A Highlighter that can get offsets from either
|
||||
|
@ -110,7 +109,7 @@ public class UnifiedHighlighter {
|
|||
}
|
||||
}
|
||||
|
||||
protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0];
|
||||
protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0];
|
||||
|
||||
protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
|
||||
|
||||
|
@ -769,7 +768,7 @@ public class UnifiedHighlighter {
|
|||
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
|
||||
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
|
||||
BytesRef[] terms = null;
|
||||
CharacterRunAutomaton[] automata = null;
|
||||
LabelledCharArrayMatcher[] automata = null;
|
||||
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
|
||||
terms = filterExtractedTerms(fieldMatcher, allTerms);
|
||||
automata = getAutomata(field, query, highlightFlags);
|
||||
|
@ -839,7 +838,7 @@ public class UnifiedHighlighter {
|
|||
: PhraseHelper.NONE;
|
||||
}
|
||||
|
||||
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
|
||||
protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
|
||||
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those?
|
||||
// if don't highlight phrases strictly,
|
||||
final boolean lookInSpan =
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.uhighlight.FieldHighlighter;
|
||||
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
|
||||
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
|
||||
import org.apache.lucene.search.uhighlight.OffsetsEnum;
|
||||
import org.apache.lucene.search.uhighlight.Passage;
|
||||
import org.apache.lucene.search.uhighlight.PassageFormatter;
|
||||
|
@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents;
|
|||
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
|
|||
(s) -> false,
|
||||
new MatchAllDocsQuery(), new BytesRef[0],
|
||||
PhraseHelper.NONE,
|
||||
new CharacterRunAutomaton[0], false, Collections.emptySet())) {
|
||||
new LabelledCharArrayMatcher[0], false, Collections.emptySet())) {
|
||||
@Override
|
||||
public UnifiedHighlighter.OffsetSource getOffsetSource() {
|
||||
return offsetSource;
|
||||
|
@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
|
|||
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
|
||||
Set<HighlightFlag> highlightFlags = getFlags(field);
|
||||
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
|
||||
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
|
||||
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
|
||||
boolean queryHasUnrecognizedPart = false;
|
||||
return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource {
|
|||
|
||||
@Override
|
||||
public void visit(String field, QueryVisitor visitor) {
|
||||
visitor.visitLeaf(new IntervalQuery(field, this));
|
||||
automaton.visit(visitor, new IntervalQuery(field, this), field);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue