From 188a78d769750be6d95f68c13d422debdd5acd7a Mon Sep 17 00:00:00 2001
From: Alan Woodward <romseygeek@apache.org>
Date: Sat, 24 Sep 2022 11:26:16 +0100
Subject: [PATCH] Don't try to highlight very long terms (#11808)

The UnifiedHighlighter can throw exceptions when highlighting terms that are longer
than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing
a confusing exception, we can instead filter out the long terms when building the
MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case.
---
 .../DaciukMihovAutomatonBuilder.java          |  2 +-
 .../uhighlight/MemoryIndexOffsetStrategy.java | 10 ++++++-
 .../uhighlight/TestUnifiedHighlighter.java    | 29 +++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
index 26952b678e3..d96a88904d0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
@@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder {
    * This builder rejects terms that are more than 1k chars long since it then uses recursion based
    * on the length of the string, which might cause stack overflows.
    */
-  static final int MAX_TERM_LENGTH = 1_000;
+  public static final int MAX_TERM_LENGTH = 1_000;
 
   /** The default constructor is private. Use static methods directly. */
   private DaciukMihovAutomatonBuilder() {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
index 79dbcbea44b..7237be57807 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -60,7 +62,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
 
     List<CharArrayMatcher> allAutomata = new ArrayList<>();
     if (components.getTerms().length > 0) {
-      allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
+      // Filter out any long terms that would otherwise cause exceptions if we tried
+      // to build an automaton on them
+      List<BytesRef> filteredTerms =
+          Arrays.stream(components.getTerms())
+              .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+              .toList();
+      allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }
     Collections.addAll(allAutomata, components.getAutomata());
     for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 482644d4243..6bad8ddbe5e 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
 
     ir.close();
   }
+
+  public void testQueryWithLongTerm() throws IOException {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter =
+        randomUnifiedHighlighter(
+            searcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true);
+
+    Query query =
+        new BooleanQuery.Builder()
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+                BooleanClause.Occur.SHOULD)
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+                BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
+            .build();
+
+    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+
+    String[] snippets = highlighter.highlight("title", query, topDocs);
+    assertArrayEquals(new String[] {"This is the <b>title</b> field."}, snippets);
+
+    ir.close();
+  }
 }