Set a hard limit on the number of tokens we run suggestion on

PhraseSuggester can be very slow and CPU intensive if a lot of terms
are suggested. Thus, to prevent cluster instability and long-running requests,
this commit adds a hard limit (10 tokens by default): if the query is parsed
into more tokens than the limit, we simply return no corrections.

Closes #3164
This commit is contained in:
Simon Willnauer 2013-06-12 15:34:20 +02:00
parent 9d3e34b9f9
commit a654c3d103
6 changed files with 65 additions and 6 deletions

View File

@ -39,20 +39,24 @@ import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidat
//TODO public for tests
public final class NoisyChannelSpellChecker {
public static final double REAL_WORD_LIKELYHOOD = 0.95d;
public static final int DEFAULT_TOKEN_LIMIT = 10;
private final double realWordLikelihood;
private final boolean requireUnigram;
private final int tokenLimit;
// Convenience constructor: uses the default real-word likelihood (0.95)
// and, via the delegation chain, the default token limit.
public NoisyChannelSpellChecker() {
this(REAL_WORD_LIKELYHOOD);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood) {
this(nonErrorLikelihood, true);
this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram) {
// Full constructor. tokenLimit caps how many tokens the spell checker will
// run suggestions on; beyond it no correction is attempted (guards against
// CPU-intensive requests on long queries).
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
this.realWordLikelihood = nonErrorLikelihood;
this.requireUnigram = requireUnigram;
this.tokenLimit = tokenLimit;
}
public Correction[] getCorrections(TokenStream stream, final CandidateGenerator generator,
@ -104,7 +108,7 @@ public final class NoisyChannelSpellChecker {
}
});
if (candidateSetsList.isEmpty()) {
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
return Correction.EMPTY;
}

View File

@ -76,6 +76,12 @@ public final class PhraseSuggestParser implements SuggestContextParser {
gramSizeSet = true;
} else if ("force_unigrams".equals(fieldName) || "forceUnigrams".equals(fieldName)) {
suggestion.setRequireUnigram(parser.booleanValue());
} else if ("token_limit".equals(fieldName) || "tokenLimit".equals(fieldName)) {
int tokenLimit = parser.intValue();
if (tokenLimit <= 0) {
throw new ElasticSearchIllegalArgumentException("token_limit must be >= 1");
}
suggestion.setTokenLimit(tokenLimit);
} else {
throw new ElasticSearchIllegalArgumentException("suggester[phrase] doesn't support field [" + fieldName + "]");
}

View File

@ -61,7 +61,7 @@ public final class PhraseSuggester implements Suggester<PhraseSuggestionContext>
}
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram());
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
final BytesRef separator = suggestion.separator();
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestion.getField(), realWordErrorLikelihood, separator);

View File

@ -43,6 +43,7 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
private Integer gramSize;
private SmoothingModel model;
private Boolean forceUnigrams;
private Integer tokenLimit;
public PhraseSuggestionBuilder(String name) {
super(name, "phrase");
@ -140,6 +141,11 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
this.model = model;
return this;
}
/**
 * Sets the maximum number of query tokens the phrase suggester will run
 * suggestions on; serialized as {@code token_limit} in the request body.
 *
 * @return this builder, for method chaining
 */
public PhraseSuggestionBuilder tokenLimit(int tokenLimit) {
this.tokenLimit = tokenLimit;
return this;
}
@Override
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
@ -161,6 +167,9 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
if (forceUnigrams != null) {
builder.field("force_unigrams", forceUnigrams);
}
if (tokenLimit != null) {
builder.field("token_limit", tokenLimit);
}
if (!generators.isEmpty()) {
Set<Entry<String, List<CandidateGenerator>>> entrySet = generators.entrySet();
for (Entry<String, List<CandidateGenerator>> entry : entrySet) {
@ -524,7 +533,7 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
this.postFilter = postFilter;
return this;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();

View File

@ -37,6 +37,7 @@ class PhraseSuggestionContext extends SuggestionContext {
private List<DirectCandidateGenerator> generators = new ArrayList<PhraseSuggestionContext.DirectCandidateGenerator>();
private int gramSize = 1;
private float confidence = 1.0f;
private int tokenLimit = NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT;
private WordScorer.WordScorerFactory scorer;
@ -153,5 +154,12 @@ class PhraseSuggestionContext extends SuggestionContext {
// Whether unigrams are required, as set via the force_unigrams option.
public boolean getRequireUnigram() {
return requireUnigram;
}
// Overrides the default limit (NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT).
// The parser guarantees the value is >= 1 before calling this.
public void setTokenLimit(int tokenLimit) {
this.tokenLimit = tokenLimit;
}
// Maximum number of tokens the spell checker will run suggestions on.
public int getTokenLimit() {
return tokenLimit;
}
}

View File

@ -767,7 +767,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
.maxErrors(0.5f)
.maxErrors(0.5f).tokenLimit(5)
.size(1));
assertThat(searchSuggest, notNullValue());
@ -777,6 +777,38 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
// check tokenLimit
searchSuggest = searchSuggest(client(), "Xor the Got-Jewel",
phraseSuggestion("simple_phrase").
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
.maxErrors(0.5f)
.size(1).tokenLimit(4));
assertThat(searchSuggest, notNullValue());
assertThat(searchSuggest.size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(0));
searchSuggest = searchSuggest(client(), "Xor the Got-Jewel Xor the Got-Jewel Xor the Got-Jewel",
phraseSuggestion("simple_phrase").
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
.maxErrors(0.5f)
.size(1).tokenLimit(15));
assertThat(searchSuggest, notNullValue());
assertThat(searchSuggest.size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel Xor the Got-Jewel Xor the Got-Jewel"));
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel xorr the god jewel xorr the god jewel"));
}
@Test