mirror of https://github.com/apache/lucene.git
Don't try to highlight very long terms (#11808)
The UnifiedHighlighter can throw exceptions when highlighting terms that are longer than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing a confusing exception, we can instead filter out the long terms when building the MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case.
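The fix, in essence: drop any term that exceeds the builder's limit before constructing the automaton. Below is a minimal standalone sketch of that idea, not Lucene's actual code: plain byte arrays stand in for the query's BytesRef terms, and the constant mirrors DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH (the real change is in the diff that follows).

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

public class LongTermFilterSketch {
  // The automaton builder's recursion depth scales with term length, so it
  // rejects very long terms; this limit mirrors its MAX_TERM_LENGTH.
  static final int MAX_TERM_LENGTH = 1_000;

  // Keep only terms short enough for the builder; over-long (likely junk) terms
  // are silently dropped instead of surfacing later as a confusing exception.
  static List<byte[]> filterLongTerms(byte[][] terms) {
    return Arrays.stream(terms).filter(t -> t.length < MAX_TERM_LENGTH).toList();
  }

  public static void main(String[] args) {
    byte[][] terms = {
      "title".getBytes(StandardCharsets.UTF_8),
      new byte[MAX_TERM_LENGTH + 1] // junk input: too long to build an automaton on
    };
    System.out.println(filterLongTerms(terms).size()); // prints 1: the long term was filtered out
  }
}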
This commit is contained in:
parent 3a04aa44c2
commit 188a78d769

@@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder {
    * This builder rejects terms that are more than 1k chars long since it then uses recursion based
    * on the length of the string, which might cause stack overflows.
    */
-  static final int MAX_TERM_LENGTH = 1_000;
+  public static final int MAX_TERM_LENGTH = 1_000;
 
   /** The default constructor is private. Use static methods directly. */
   private DaciukMihovAutomatonBuilder() {

@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.

@@ -60,7 +62,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
 
     List<CharArrayMatcher> allAutomata = new ArrayList<>();
     if (components.getTerms().length > 0) {
-      allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
+      // Filter out any long terms that would otherwise cause exceptions if we tried
+      // to build an automaton on them
+      List<BytesRef> filteredTerms =
+          Arrays.stream(components.getTerms())
+              .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+              .toList();
+      allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }
     Collections.addAll(allAutomata, components.getAutomata());
     for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {

@@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
 
     ir.close();
   }
+
+  public void testQueryWithLongTerm() throws IOException {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter =
+        randomUnifiedHighlighter(
+            searcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true);
+
+    Query query =
+        new BooleanQuery.Builder()
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+                BooleanClause.Occur.SHOULD)
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+                BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
+            .build();
+
+    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+
+    String[] snippets = highlighter.highlight("title", query, topDocs);
+    assertArrayEquals(new String[] {"This is the <b>title</b> field."}, snippets);
+
+    ir.close();
+  }
 }