From bc0fe7d64d0de8544600b669fcbbe6afd16513bc Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 8 Apr 2019 11:22:38 +0200 Subject: [PATCH] Handle min_doc_freq in phrase suggester (#40840) The phrase suggesters have an option to remove terms that have a frequency lower than a provided min_doc_freq. However this value is overwritten by the frequency of the original term in the popular mode. This change ensures that we keep the maximum value between the provided min_doc_value and the original term frequency as a threshold to select candidates. Fixes #16764 --- .../phrase/DirectCandidateGenerator.java | 54 +++++++++++-------- .../search/suggest/SuggestSearchIT.java | 46 ++++++++++++++++ 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index 9d9721ab046..381812104ba 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -132,31 +132,41 @@ public final class DirectCandidateGenerator extends CandidateGenerator { public CandidateSet drawCandidates(CandidateSet set) throws IOException { Candidate original = set.originalTerm; BytesRef term = preFilter(original.term, spare, byteSpare); - if (suggestMode != SuggestMode.SUGGEST_ALWAYS) { - /** - * We use the {@link TermStats#docFreq} to compute the frequency threshold - * because that's what {@link DirectSpellChecker#suggestSimilar} expects - * when filtering terms. - */ - int threshold = thresholdTermFrequency(original.termStats.docFreq); - if (threshold == Integer.MAX_VALUE) { - // the threshold is the max possible frequency so we can skip the search - return set; + float origThreshold = spellchecker.getThresholdFrequency(); + try { + if (suggestMode != SuggestMode.SUGGEST_ALWAYS) { + /** + * We use the {@link TermStats#docFreq} to compute the frequency threshold + * because that's what {@link DirectSpellChecker#suggestSimilar} expects + * when filtering terms. + */ + int threshold = thresholdTermFrequency(original.termStats.docFreq); + if (threshold == Integer.MAX_VALUE) { + // the threshold is the max possible frequency so we can skip the search + return set; + } + // don't override the threshold if the provided min_doc_freq is greater + // than the original term frequency. + if (spellchecker.getThresholdFrequency() < threshold) { + spellchecker.setThresholdFrequency(threshold); + } } - spellchecker.setThresholdFrequency(threshold); - } - SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode); - List candidates = new ArrayList<>(suggestSimilar.length); - for (int i = 0; i < suggestSimilar.length; i++) { - SuggestWord suggestWord = suggestSimilar[i]; - BytesRef candidate = new BytesRef(suggestWord.string); - TermStats termStats = internalTermStats(candidate); - postFilter(new Candidate(candidate, termStats, - suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates); + SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode); + List candidates = new ArrayList<>(suggestSimilar.length); + for (int i = 0; i < suggestSimilar.length; i++) { + SuggestWord suggestWord = suggestSimilar[i]; + BytesRef candidate = new BytesRef(suggestWord.string); + TermStats termStats = internalTermStats(candidate); + postFilter(new Candidate(candidate, termStats, + suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates); + } + set.addCandidates(candidates); + return set; + } finally { + // restore the original value back + spellchecker.setThresholdFrequency(origThreshold); } - set.addCandidates(candidates); - return set; } protected BytesRef preFilter(final BytesRef term, final CharsRefBuilder spare, final BytesRefBuilder byteSpare) throws IOException { diff --git a/server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java b/server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java index d8c2cce0df1..1813f17141b 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java @@ -1005,6 +1005,52 @@ public class SuggestSearchIT extends ESIntegTestCase { assertSuggestion(searchSuggest, 0, "suggestion", "apple"); } + public void testPhraseSuggestMinDocFreq() throws Exception { + XContentBuilder mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("type") + .startObject("properties") + .startObject("text") + .field("type", "keyword") + .endObject() + .endObject() + .endObject() + .endObject(); + assertAcked(prepareCreate("test") + .setSettings(Settings.builder().put("index.number_of_shards", 1).build()) + .addMapping("type", mapping)); + + List builders = new ArrayList<>(); + builders.add(client().prepareIndex("test", "type").setSource("text", "apple")); + builders.add(client().prepareIndex("test", "type").setSource("text", "apple")); + builders.add(client().prepareIndex("test", "type").setSource("text", "apple")); + builders.add(client().prepareIndex("test", "type").setSource("text", "appfle")); + indexRandom(true, false, builders); + + PhraseSuggestionBuilder phraseSuggest = phraseSuggestion("text").text("appple") + .size(2) + .addCandidateGenerator(new DirectCandidateGeneratorBuilder("text") + .suggestMode("popular")); + + Suggest searchSuggest = searchSuggest("suggestion", phraseSuggest); + assertSuggestion(searchSuggest, 0, "suggestion", 2, "apple", "appfle"); + + phraseSuggest = phraseSuggestion("text").text("appple") + .addCandidateGenerator(new DirectCandidateGeneratorBuilder("text") + .suggestMode("popular") + .minDocFreq(2)); + + searchSuggest = searchSuggest("suggestion", phraseSuggest); + assertSuggestion(searchSuggest, 0, "suggestion", 1,"apple"); + + phraseSuggest = phraseSuggestion("text").text("appple") + .addCandidateGenerator(new DirectCandidateGeneratorBuilder("text") + .suggestMode("popular") + .minDocFreq(2)); + searchSuggest = searchSuggest("suggestion", phraseSuggest); + assertSuggestion(searchSuggest, 0, "suggestion", 1,"apple"); + } + @Override protected Collection> nodePlugins() { return Collections.singleton(DummyTemplatePlugin.class);