diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 47592203b3c..91c67c0236c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -37,6 +37,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; /** Subclass of TermsEnum for enumerating all terms that are similar * to the specified filter term. @@ -131,7 +132,11 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { prevAutomata = new CompiledAutomaton[maxEdits+1]; Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits); for (int i = 0; i <= maxEdits; i++) { - prevAutomata[i] = new CompiledAutomaton(automata[i], true, false); + try { + prevAutomata[i] = new CompiledAutomaton(automata[i], true, false); + } catch (TooComplexToDeterminizeException e) { + throw new FuzzyTermsException(term.text(), e); + } } // first segment computes the automata, and we share with subsequent segments via this Attribute: dfaAtt.setAutomata(prevAutomata); @@ -407,4 +412,15 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { reflector.reflect(LevenshteinAutomataAttribute.class, "automata", automata); } } + + /** + * Thrown to indicate that there was an issue creating a fuzzy query for a given term. + * Typically occurs with terms longer than 220 UTF-8 characters, + * but also possible with shorter terms consisting of UTF-32 code points. 
+ */ + public static class FuzzyTermsException extends RuntimeException { + /** Builds the exception with the message "Term too complex: " followed by {@code term}, keeping {@code cause} as the underlying cause. The constructor is private, so only code inside the enclosing top-level class can create instances. */ private FuzzyTermsException(String term, Throwable cause) { + super("Term too complex: " + term, cause); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 024abc3c95f..f882ee75446 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -25,6 +25,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import com.carrotsearch.randomizedtesting.RandomizedTest; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; @@ -35,6 +36,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.store.Directory; @@ -43,6 +46,9 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.Operations; + +import static org.hamcrest.CoreMatchers.containsString; /** * Tests {@link FuzzyQuery}. 
@@ -492,7 +498,63 @@ public class TestFuzzyQuery extends LuceneTestCase { }); assertTrue(expected.getMessage().contains("maxExpansions must be positive")); } - + + /** Checks that requesting the terms enum of a FuzzyQuery over a long random term throws FuzzyTermsEnum.FuzzyTermsException, and that the exception message contains the term text. */ public void testErrorMessage() { + // 45 states per vector from Lev2TParametricDescription + /* NOTE(review): the extra 10 looks like a safety margin meant to push the automaton past the determinization limit - confirm against Operations.DEFAULT_MAX_DETERMINIZED_STATES */ int length = (Operations.DEFAULT_MAX_DETERMINIZED_STATES / 45) + 10; + + String value = RandomizedTest.randomRealisticUnicodeOfCodepointLength(length); + FuzzyTermsEnum.FuzzyTermsException expected = expectThrows(FuzzyTermsEnum.FuzzyTermsException.class, () -> { + /* Stub Terms: every method throws UnsupportedOperationException, so the test can only pass if FuzzyTermsException is raised instead of one of these. */ new FuzzyQuery(new Term("field", value)).getTermsEnum(new Terms() { + @Override + public TermsEnum iterator() { + throw new UnsupportedOperationException(); + } + + @Override + public long size() { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumTotalTermFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumDocFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public int getDocCount() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasFreqs() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasOffsets() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPositions() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPayloads() { + throw new UnsupportedOperationException(); + } + }); + }); + /* The message must name the offending term. */ assertThat(expected.getMessage(), containsString(value)); + } + private void addDoc(String text, RandomIndexWriter writer) throws IOException { Document doc = new Document(); doc.add(newTextField("field", text, Field.Store.YES));