LUCENE-9098 Report bad term for fuzzy query

When a fuzzy query encounters a term that is too complex, the exception
should report the term instead of a cryptic message about too many
states.
This commit is contained in:
Mike Drob 2019-12-17 15:18:06 -06:00
parent 907d1142fa
commit a4c884a22f
2 changed files with 80 additions and 2 deletions

View File

@ -37,6 +37,7 @@ import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/** Subclass of TermsEnum for enumerating all terms that are similar
* to the specified filter term.
@ -131,7 +132,11 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
prevAutomata = new CompiledAutomaton[maxEdits+1];
Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
for (int i = 0; i <= maxEdits; i++) {
prevAutomata[i] = new CompiledAutomaton(automata[i], true, false);
try {
prevAutomata[i] = new CompiledAutomaton(automata[i], true, false);
} catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsException(term.text(), e);
}
}
// first segment computes the automata, and we share with subsequent segments via this Attribute:
dfaAtt.setAutomata(prevAutomata);
@ -407,4 +412,15 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
reflector.reflect(LevenshteinAutomataAttribute.class, "automata", automata);
}
}
/**
 * Signals that a fuzzy query could not be created for a particular term.
 * This typically happens for terms longer than 220 UTF-8 characters,
 * but it is also possible for shorter terms consisting of UTF-32 code points.
 * The exception message includes the text of the offending term, and the
 * originating failure is kept as the cause.
 */
public static class FuzzyTermsException extends RuntimeException {
  private FuzzyTermsException(String term, Throwable cause) {
    super(describe(term), cause);
  }

  // Builds the exception message: a fixed prefix followed by the term text.
  private static String describe(String term) {
    return "Term too complex: " + term;
  }
}
}

View File

@ -25,6 +25,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -35,6 +36,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
@ -43,6 +46,9 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import static org.hamcrest.CoreMatchers.containsString;
/**
* Tests {@link FuzzyQuery}.
@ -492,7 +498,63 @@ public class TestFuzzyQuery extends LuceneTestCase {
});
assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
}
/**
 * Checks that requesting a terms enum from a {@link FuzzyQuery} built on a very long
 * random term fails with a {@link FuzzyTermsEnum.FuzzyTermsException} whose message
 * contains the term text.
 */
public void testErrorMessage() {
  // 45 states per vector from Lev2TParametricDescription
  final int codePointCount = 10 + (Operations.DEFAULT_MAX_DETERMINIZED_STATES / 45);
  final String termText = RandomizedTest.randomRealisticUnicodeOfCodepointLength(codePointCount);

  FuzzyTermsEnum.FuzzyTermsException thrown =
      expectThrows(FuzzyTermsEnum.FuzzyTermsException.class, () -> {
        FuzzyQuery query = new FuzzyQuery(new Term("field", termText));

        // Stub Terms: every operation is unsupported. The test expects the
        // FuzzyTermsException rather than an UnsupportedOperationException, so
        // presumably the failure happens before any of these methods is called.
        Terms unusableTerms = new Terms() {
          @Override
          public TermsEnum iterator() {
            throw new UnsupportedOperationException();
          }

          @Override
          public long size() {
            throw new UnsupportedOperationException();
          }

          @Override
          public long getSumTotalTermFreq() {
            throw new UnsupportedOperationException();
          }

          @Override
          public long getSumDocFreq() {
            throw new UnsupportedOperationException();
          }

          @Override
          public int getDocCount() {
            throw new UnsupportedOperationException();
          }

          @Override
          public boolean hasFreqs() {
            throw new UnsupportedOperationException();
          }

          @Override
          public boolean hasOffsets() {
            throw new UnsupportedOperationException();
          }

          @Override
          public boolean hasPositions() {
            throw new UnsupportedOperationException();
          }

          @Override
          public boolean hasPayloads() {
            throw new UnsupportedOperationException();
          }
        };

        query.getTermsEnum(unusableTerms);
      });

  // The whole point of the change under test: the message must name the bad term.
  String message = thrown.getMessage();
  assertThat(message, containsString(termText));
}
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(newTextField("field", text, Field.Store.YES));