Handle min_doc_freq in phrase suggester (#40840)

The phrase suggesters have an option to remove terms that have
a frequency lower than a provided min_doc_freq. However this value is
overwritten by the frequency of the original term in the popular mode.
This change ensures that we keep the maximum value between the provided
min_doc_value and the original term frequency as a threshold to select
candidates.

Fixes #16764
This commit is contained in:
Jim Ferenczi 2019-04-08 11:22:38 +02:00 committed by jimczi
parent fb5d7cf237
commit bc0fe7d64d
2 changed files with 78 additions and 22 deletions

View File

@ -132,31 +132,41 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
public CandidateSet drawCandidates(CandidateSet set) throws IOException {
Candidate original = set.originalTerm;
BytesRef term = preFilter(original.term, spare, byteSpare);
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
/**
* We use the {@link TermStats#docFreq} to compute the frequency threshold
* because that's what {@link DirectSpellChecker#suggestSimilar} expects
* when filtering terms.
*/
int threshold = thresholdTermFrequency(original.termStats.docFreq);
if (threshold == Integer.MAX_VALUE) {
// the threshold is the max possible frequency so we can skip the search
return set;
float origThreshold = spellchecker.getThresholdFrequency();
try {
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
/**
* We use the {@link TermStats#docFreq} to compute the frequency threshold
* because that's what {@link DirectSpellChecker#suggestSimilar} expects
* when filtering terms.
*/
int threshold = thresholdTermFrequency(original.termStats.docFreq);
if (threshold == Integer.MAX_VALUE) {
// the threshold is the max possible frequency so we can skip the search
return set;
}
// don't override the threshold if the provided min_doc_freq is greater
// than the original term frequency.
if (spellchecker.getThresholdFrequency() < threshold) {
spellchecker.setThresholdFrequency(threshold);
}
}
spellchecker.setThresholdFrequency(threshold);
}
SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode);
List<Candidate> candidates = new ArrayList<>(suggestSimilar.length);
for (int i = 0; i < suggestSimilar.length; i++) {
SuggestWord suggestWord = suggestSimilar[i];
BytesRef candidate = new BytesRef(suggestWord.string);
TermStats termStats = internalTermStats(candidate);
postFilter(new Candidate(candidate, termStats,
suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates);
SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode);
List<Candidate> candidates = new ArrayList<>(suggestSimilar.length);
for (int i = 0; i < suggestSimilar.length; i++) {
SuggestWord suggestWord = suggestSimilar[i];
BytesRef candidate = new BytesRef(suggestWord.string);
TermStats termStats = internalTermStats(candidate);
postFilter(new Candidate(candidate, termStats,
suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates);
}
set.addCandidates(candidates);
return set;
} finally {
// restore the original value back
spellchecker.setThresholdFrequency(origThreshold);
}
set.addCandidates(candidates);
return set;
}
protected BytesRef preFilter(final BytesRef term, final CharsRefBuilder spare, final BytesRefBuilder byteSpare) throws IOException {

View File

@ -1005,6 +1005,52 @@ public class SuggestSearchIT extends ESIntegTestCase {
assertSuggestion(searchSuggest, 0, "suggestion", "apple");
}
public void testPhraseSuggestMinDocFreq() throws Exception {
XContentBuilder mapping = XContentFactory.jsonBuilder()
.startObject()
.startObject("type")
.startObject("properties")
.startObject("text")
.field("type", "keyword")
.endObject()
.endObject()
.endObject()
.endObject();
assertAcked(prepareCreate("test")
.setSettings(Settings.builder().put("index.number_of_shards", 1).build())
.addMapping("type", mapping));
List<IndexRequestBuilder> builders = new ArrayList<>();
builders.add(client().prepareIndex("test", "type").setSource("text", "apple"));
builders.add(client().prepareIndex("test", "type").setSource("text", "apple"));
builders.add(client().prepareIndex("test", "type").setSource("text", "apple"));
builders.add(client().prepareIndex("test", "type").setSource("text", "appfle"));
indexRandom(true, false, builders);
PhraseSuggestionBuilder phraseSuggest = phraseSuggestion("text").text("appple")
.size(2)
.addCandidateGenerator(new DirectCandidateGeneratorBuilder("text")
.suggestMode("popular"));
Suggest searchSuggest = searchSuggest("suggestion", phraseSuggest);
assertSuggestion(searchSuggest, 0, "suggestion", 2, "apple", "appfle");
phraseSuggest = phraseSuggestion("text").text("appple")
.addCandidateGenerator(new DirectCandidateGeneratorBuilder("text")
.suggestMode("popular")
.minDocFreq(2));
searchSuggest = searchSuggest("suggestion", phraseSuggest);
assertSuggestion(searchSuggest, 0, "suggestion", 1,"apple");
phraseSuggest = phraseSuggestion("text").text("appple")
.addCandidateGenerator(new DirectCandidateGeneratorBuilder("text")
.suggestMode("popular")
.minDocFreq(2));
searchSuggest = searchSuggest("suggestion", phraseSuggest);
assertSuggestion(searchSuggest, 0, "suggestion", 1,"apple");
}
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singleton(DummyTemplatePlugin.class);