mirror of https://github.com/apache/lucene.git
Don't try to highlight very long terms (#11808)
The UnifiedHighlighter can throw exceptions when highlighting terms that are longer than the maximum size the DaciukMihovAutomatonBuilder accepts. Rather than throwing a confusing exception, we can instead filter out the long terms when building the MemoryIndexOffsetStrategy. Very long terms are likely to be junk input in any case.
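The fix, in essence: drop any term that exceeds the builder's limit before constructing the automaton. Below is a minimal standalone sketch of that idea, not Lucene's actual code: plain byte arrays stand in for the query's BytesRef terms, and the constant mirrors DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH (the real change is in the diff that follows).

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

public class LongTermFilterSketch {
  // The automaton builder's recursion depth scales with term length, so it
  // rejects very long terms; this limit mirrors its MAX_TERM_LENGTH.
  static final int MAX_TERM_LENGTH = 1_000;

  // Keep only terms short enough for the builder; over-long (likely junk) terms
  // are silently dropped instead of surfacing later as a confusing exception.
  static List<byte[]> filterLongTerms(byte[][] terms) {
    return Arrays.stream(terms).filter(t -> t.length < MAX_TERM_LENGTH).toList();
  }

  public static void main(String[] args) {
    byte[][] terms = {
      "title".getBytes(StandardCharsets.UTF_8),
      new byte[MAX_TERM_LENGTH + 1] // junk input: too long to build an automaton on
    };
    System.out.println(filterLongTerms(terms).size()); // prints 1: the long term was filtered out
  }
}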
This commit is contained in:
parent 3a04aa44c2
commit 188a78d769

@@ -35,7 +35,7 @@ public final class DaciukMihovAutomatonBuilder {
    * This builder rejects terms that are more than 1k chars long since it then uses recursion based
    * on the length of the string, which might cause stack overflows.
    */
-  static final int MAX_TERM_LENGTH = 1_000;
+  public static final int MAX_TERM_LENGTH = 1_000;
 
   /** The default constructor is private. Use static methods directly. */
   private DaciukMihovAutomatonBuilder() {

@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.

@@ -60,7 +62,13 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
 
     List<CharArrayMatcher> allAutomata = new ArrayList<>();
     if (components.getTerms().length > 0) {
-      allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
+      // Filter out any long terms that would otherwise cause exceptions if we tried
+      // to build an automaton on them
+      List<BytesRef> filteredTerms =
+          Arrays.stream(components.getTerms())
+              .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+              .toList();
+      allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }
     Collections.addAll(allAutomata, components.getAutomata());
     for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {

@@ -58,6 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1659,4 +1660,32 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
 
     ir.close();
   }
+
+  public void testQueryWithLongTerm() throws IOException {
+    IndexReader ir = indexSomeFields();
+    IndexSearcher searcher = newSearcher(ir);
+    UnifiedHighlighter highlighter =
+        randomUnifiedHighlighter(
+            searcher, indexAnalyzer, EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true);
+
+    Query query =
+        new BooleanQuery.Builder()
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+                BooleanClause.Occur.SHOULD)
+            .add(
+                new TermQuery(
+                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+                BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
+            .build();
+
+    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+
+    String[] snippets = highlighter.highlight("title", query, topDocs);
+    assertArrayEquals(new String[] {"This is the <b>title</b> field."}, snippets);
+
+    ir.close();
+  }
 }