Set a hard limit on the number of tokens we run suggestion on
PhraseSuggester can be very slow and CPU intensive if a lot of terms are suggested. To prevent cluster instability and long-running requests, this commit adds a hard limit (10 tokens by default): if the query is parsed into more tokens than the limit, no correction is returned. Closes #3164
This commit is contained in:
parent
9d3e34b9f9
commit
a654c3d103
|
@ -39,20 +39,24 @@ import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidat
|
|||
//TODO public for tests
|
||||
public final class NoisyChannelSpellChecker {
|
||||
public static final double REAL_WORD_LIKELYHOOD = 0.95d;
|
||||
public static final int DEFAULT_TOKEN_LIMIT = 10;
|
||||
private final double realWordLikelihood;
|
||||
private final boolean requireUnigram;
|
||||
private final int tokenLimit;
|
||||
|
||||
public NoisyChannelSpellChecker() {
|
||||
this(REAL_WORD_LIKELYHOOD);
|
||||
}
|
||||
|
||||
public NoisyChannelSpellChecker(double nonErrorLikelihood) {
|
||||
this(nonErrorLikelihood, true);
|
||||
this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
|
||||
}
|
||||
|
||||
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram) {
|
||||
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
|
||||
this.realWordLikelihood = nonErrorLikelihood;
|
||||
this.requireUnigram = requireUnigram;
|
||||
this.tokenLimit = tokenLimit;
|
||||
|
||||
}
|
||||
|
||||
public Correction[] getCorrections(TokenStream stream, final CandidateGenerator generator,
|
||||
|
@ -104,7 +108,7 @@ public final class NoisyChannelSpellChecker {
|
|||
}
|
||||
});
|
||||
|
||||
if (candidateSetsList.isEmpty()) {
|
||||
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
|
||||
return Correction.EMPTY;
|
||||
}
|
||||
|
||||
|
|
|
@ -76,6 +76,12 @@ public final class PhraseSuggestParser implements SuggestContextParser {
|
|||
gramSizeSet = true;
|
||||
} else if ("force_unigrams".equals(fieldName) || "forceUnigrams".equals(fieldName)) {
|
||||
suggestion.setRequireUnigram(parser.booleanValue());
|
||||
} else if ("token_limit".equals(fieldName) || "tokenLimit".equals(fieldName)) {
|
||||
int tokenLimit = parser.intValue();
|
||||
if (tokenLimit <= 0) {
|
||||
throw new ElasticSearchIllegalArgumentException("token_limit must be >= 1");
|
||||
}
|
||||
suggestion.setTokenLimit(tokenLimit);
|
||||
} else {
|
||||
throw new ElasticSearchIllegalArgumentException("suggester[phrase] doesn't support field [" + fieldName + "]");
|
||||
}
|
||||
|
|
|
@ -61,7 +61,7 @@ public final class PhraseSuggester implements Suggester<PhraseSuggestionContext>
|
|||
}
|
||||
|
||||
|
||||
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram());
|
||||
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
|
||||
final BytesRef separator = suggestion.separator();
|
||||
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
|
||||
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestion.getField(), realWordErrorLikelihood, separator);
|
||||
|
|
|
@ -43,6 +43,7 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
|
|||
private Integer gramSize;
|
||||
private SmoothingModel model;
|
||||
private Boolean forceUnigrams;
|
||||
private Integer tokenLimit;
|
||||
|
||||
public PhraseSuggestionBuilder(String name) {
|
||||
super(name, "phrase");
|
||||
|
@ -140,6 +141,11 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
|
|||
this.model = model;
|
||||
return this;
|
||||
}
|
||||
|
||||
public PhraseSuggestionBuilder tokenLimit(int tokenLimit) {
|
||||
this.tokenLimit = tokenLimit;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
|
@ -161,6 +167,9 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
|
|||
if (forceUnigrams != null) {
|
||||
builder.field("force_unigrams", forceUnigrams);
|
||||
}
|
||||
if (tokenLimit != null) {
|
||||
builder.field("token_limit", tokenLimit);
|
||||
}
|
||||
if (!generators.isEmpty()) {
|
||||
Set<Entry<String, List<CandidateGenerator>>> entrySet = generators.entrySet();
|
||||
for (Entry<String, List<CandidateGenerator>> entry : entrySet) {
|
||||
|
@ -524,7 +533,7 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
|
|||
this.postFilter = postFilter;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject();
|
||||
|
|
|
@ -37,6 +37,7 @@ class PhraseSuggestionContext extends SuggestionContext {
|
|||
private List<DirectCandidateGenerator> generators = new ArrayList<PhraseSuggestionContext.DirectCandidateGenerator>();
|
||||
private int gramSize = 1;
|
||||
private float confidence = 1.0f;
|
||||
private int tokenLimit = NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT;
|
||||
|
||||
private WordScorer.WordScorerFactory scorer;
|
||||
|
||||
|
@ -153,5 +154,12 @@ class PhraseSuggestionContext extends SuggestionContext {
|
|||
public boolean getRequireUnigram() {
|
||||
return requireUnigram;
|
||||
}
|
||||
|
||||
public void setTokenLimit(int tokenLimit) {
|
||||
this.tokenLimit = tokenLimit;
|
||||
}
|
||||
|
||||
public int getTokenLimit() {
|
||||
return tokenLimit;
|
||||
}
|
||||
}
|
|
@ -767,7 +767,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
|
||||
.maxErrors(0.5f)
|
||||
.maxErrors(0.5f).tokenLimit(5)
|
||||
.size(1));
|
||||
|
||||
assertThat(searchSuggest, notNullValue());
|
||||
|
@ -777,6 +777,38 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
||||
|
||||
// check tokenLimit
|
||||
|
||||
searchSuggest = searchSuggest(client(), "Xor the Got-Jewel",
|
||||
phraseSuggestion("simple_phrase").
|
||||
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
|
||||
.maxErrors(0.5f)
|
||||
.size(1).tokenLimit(4));
|
||||
|
||||
assertThat(searchSuggest, notNullValue());
|
||||
assertThat(searchSuggest.size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(0));
|
||||
|
||||
searchSuggest = searchSuggest(client(), "Xor the Got-Jewel Xor the Got-Jewel Xor the Got-Jewel",
|
||||
phraseSuggestion("simple_phrase").
|
||||
realWordErrorLikelihood(0.95f).field("bigram").gramSize(2).analyzer("body")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||
.smoothingModel(new PhraseSuggestionBuilder.StupidBackoff(0.1))
|
||||
.maxErrors(0.5f)
|
||||
.size(1).tokenLimit(15));
|
||||
assertThat(searchSuggest, notNullValue());
|
||||
assertThat(searchSuggest.size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel Xor the Got-Jewel Xor the Got-Jewel"));
|
||||
assertThat(searchSuggest.getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel xorr the god jewel xorr the god jewel"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue