From 188a78d769750be6d95f68c13d422debdd5acd7a Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Sat, 24 Sep 2022 11:26:16 +0100 Subject: [PATCH] Don't try to highlight very long terms (#11808) The UnifiedHighlighter can throw exceptions when highlighting terms that are longer than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing a confusing exception, we can instead filter out the long terms when building the MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case. --- .../DaciukMihovAutomatonBuilder.java | 2 +- .../uhighlight/MemoryIndexOffsetStrategy.java | 10 ++++++- .../uhighlight/TestUnifiedHighlighter.java | 29 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 26952b678e3..d96a88904d0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder { * This builder rejects terms that are more than 1k chars long since it then uses recursion based * on the length of the string, which might cause stack overflows. */ - static final int MAX_TERM_LENGTH = 1_000; + public static final int MAX_TERM_LENGTH = 1_000; /** The default constructor is private. Use static methods directly. */ private DaciukMihovAutomatonBuilder() { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index 79dbcbea44b..7237be57807 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -28,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.queries.spans.SpanQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; /** * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}. @@ -60,7 +62,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { List allAutomata = new ArrayList<>(); if (components.getTerms().length > 0) { - allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms()))); + // Filter out any long terms that would otherwise cause exceptions if we tried + // to build an automaton on them + List filteredTerms = + Arrays.stream(components.getTerms()) + .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH) + .toList(); + allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms)); } Collections.addAll(allAutomata, components.getAutomata()); for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 482644d4243..6bad8ddbe5e 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; import org.junit.After; import org.junit.Before; @@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase { ir.close(); } + + public void testQueryWithLongTerm() throws IOException { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = + randomUnifiedHighlighter( + searcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true); + + Query query = + new BooleanQuery.Builder() + .add( + new TermQuery( + new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))), + BooleanClause.Occur.SHOULD) + .add( + new TermQuery( + new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))), + BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD) + .build(); + + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + + String[] snippets = highlighter.highlight("title", query, topDocs); + assertArrayEquals(new String[] {"This is the title field."}, snippets); + + ir.close(); + } }