Don't try to highlight very long terms (#11808)

The UnifiedHighlighter can throw exceptions when highlighting terms that are longer
than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing
a confusing exception, we can instead filter out the long terms when building the
MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case.
This commit is contained in:
Alan Woodward 2022-09-24 11:26:16 +01:00 committed by GitHub
parent 3a04aa44c2
commit 188a78d769
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 2 deletions

View File

@@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder {
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
* on the length of the string, which might cause stack overflows.
*/
static final int MAX_TERM_LENGTH = 1_000;
public static final int MAX_TERM_LENGTH = 1_000;
/** The default constructor is private. Use static methods directly. */
private DaciukMihovAutomatonBuilder() {

View File

@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -60,7 +62,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
List<CharArrayMatcher> allAutomata = new ArrayList<>();
if (components.getTerms().length > 0) {
allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
// Filter out any long terms that would otherwise cause exceptions if we tried
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
.filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
.toList();
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
Collections.addAll(allAutomata, components.getAutomata());
for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {

View File

@@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.junit.After;
import org.junit.Before;
@@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
ir.close();
}
/**
 * Highlighting a query containing terms at and beyond {@link
 * DaciukMihovAutomatonBuilder#MAX_TERM_LENGTH} must not throw; the over-long terms are
 * silently dropped and the remaining term is still highlighted.
 */
public void testQueryWithLongTerm() throws IOException {
IndexReader reader = indexSomeFields();
IndexSearcher indexSearcher = newSearcher(reader);
UnifiedHighlighter unifiedHighlighter =
randomUnifiedHighlighter(
indexSearcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true);
// One term exactly at the automaton builder's limit, one just past it, and one real match.
String termAtLimit = "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH);
String termPastLimit = "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1);
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
queryBuilder.add(new TermQuery(new Term("title", termAtLimit)), BooleanClause.Occur.SHOULD);
queryBuilder.add(new TermQuery(new Term("title", termPastLimit)), BooleanClause.Occur.SHOULD);
queryBuilder.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
TopDocs hits = indexSearcher.search(query, 10, Sort.INDEXORDER);
String[] fragments = unifiedHighlighter.highlight("title", query, hits);
// Only the sane term should produce a highlighted snippet.
assertArrayEquals(new String[] {"This is the <b>title</b> field."}, fragments);
reader.close();
}
}